Mirror of https://github.com/huggingface/diffusers.git (synced 2025-12-12 23:44:30 +08:00)

Compare commits (129 commits): pipeline-s...modular-re
Commits in this compare (SHA1):
d34c4e8caf, b46b7c8b31, fc9168f429, 31a31ca1c5, 8423652b35, de631947cc, 58e9565719, cb6d5fed19,
f16e9c7807, 87f63d424a, 29de29f02c, 72e1b74638, 3471f2fb75, d136ae36c8, 1b89ac144c, eb9415031a,
de6ab6b49d, 4968edc5dc, 808dff09cb, 61dac3bbe4, 73ab5725c2, 163341d3dd, d0fbf745e6, 27c1158b23,
96ce6744fe, 8ad14a52cb, a7fb2d2a22, a0deefb606, e2491af650, 506a8ea09c, 58358c2d00, 5cde77f915,
522e827625, 144eae4e0b, 796453cad1, 153ae34ff6, 0acb5e1460, 462429b687, cf01aaeb49, 2017ae5624,
2b361a2413, c677d528e4, 0f0618ff2b, d89631fc50, 16b6583fa8, f552773572, dc4dbfe107, 43ac1ff7e7,
efd70b7838, 7ca860c24b, 7b86fcea31, c8b5d56412, ce642e92da, 6a509ba862, 6d5beefe29, b863bdd6ca,
d143851309, 9ad1470d48, bf99ab2f55, ee842839ef, 96795afc72, 12650e1393, addaad013c, 485f8d1758,
cff0fd6260, 8ddb20bfb8, e5089d702b, 2c3e4eafa8, c7020df2cf, 4bed3e306e, 00a3bc9d6c, ccb35acd81,
00cae4e857, b3fb4188f5, 71df1581f7, d046cf7d35, 68a5185c86, 6e2fe26bfd, 77b5fa59c5, a226920b52,
7007f72409, a6804de4a2, 7f897a9fc4, 0966663d2a, fb78f4f12d, 2220af6940, 7a34832d52, e973de64f9,
db94ca882d, 6985906a2e, 54f410db6c, c12a05b9c1, 2e0f5c86cc, 1d63306295, 6c93626f6f, 72c5bf07c8,
ed59f90f15, a09ca7f27e, 8c02572e16, 27dde51de8, 10d4a775f1, 72d9a81d99, 4fa85c7963, 806e8e66fb,
0b90051db8, b305c779b2, 2b3cd2d39c, bc3d1c9ee6, e50d614636, a8df0f1ffb, ace53e2d2f, ffc2992fc2,
c70a285c2c, 8b811feece, 37e8dc7a59, 024a9f5de3, 005195c23e, 6742f160df, 540d303250, f1b3036ca1,
46ec1743a2, 70272b1108, 2b6dcbfa1d, af9572d759, ddea157979, ad3f9a26c0, e8d0980f9f, 52a7f1cb97,
33f85fadf6

src/diffusers/__init__.py
@@ -34,10 +34,12 @@ from .utils import (
 _import_structure = {
     "configuration_utils": ["ConfigMixin"],
+    "guiders": [],
     "hooks": [],
     "loaders": ["FromOriginalModelMixin"],
     "models": [],
     "pipelines": [],
+    "modular_pipelines": [],
     "quantizers.quantization_config": [],
     "schedulers": [],
     "utils": [

@@ -130,12 +132,26 @@ except OptionalDependencyNotAvailable:
     _import_structure["utils.dummy_pt_objects"] = [name for name in dir(dummy_pt_objects) if not name.startswith("_")]

 else:
+    _import_structure["guiders"].extend(
+        [
+            "AdaptiveProjectedGuidance",
+            "AutoGuidance",
+            "ClassifierFreeGuidance",
+            "ClassifierFreeZeroStarGuidance",
+            "SkipLayerGuidance",
+            "SmoothedEnergyGuidance",
+            "TangentialClassifierFreeGuidance",
+        ]
+    )
     _import_structure["hooks"].extend(
         [
             "FasterCacheConfig",
             "HookRegistry",
             "PyramidAttentionBroadcastConfig",
+            "LayerSkipConfig",
+            "SmoothedEnergyGuidanceConfig",
             "apply_faster_cache",
+            "apply_layer_skip",
             "apply_pyramid_attention_broadcast",
         ]
     )

@@ -245,6 +261,15 @@ else:
             "StableDiffusionMixin",
         ]
     )
+    _import_structure["modular_pipelines"].extend(
+        [
+            "ModularLoader",
+            "ModularPipeline",
+            "ModularPipelineBlocks",
+            "ComponentSpec",
+            "ComponentsManager",
+        ]
+    )
     _import_structure["quantizers"] = ["DiffusersQuantizer"]
     _import_structure["schedulers"].extend(
         [

@@ -523,6 +548,24 @@ else:
         ]
     )
+
+try:
+    if not (is_torch_available() and is_transformers_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _import_structure["utils.dummy_torch_and_transformers_objects"] = [
+        name for name in dir(dummy_torch_and_transformers_objects) if not name.startswith("_")
+    ]
+
+else:
+    _import_structure["modular_pipelines"].extend(
+        [
+            "StableDiffusionXLAutoPipeline",
+            "StableDiffusionXLModularLoader",
+        ]
+    )
 try:
     if not (is_torch_available() and is_transformers_available() and is_opencv_available()):
         raise OptionalDependencyNotAvailable()

@@ -728,10 +771,22 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     except OptionalDependencyNotAvailable:
         from .utils.dummy_pt_objects import *  # noqa F403
     else:
+        from .guiders import (
+            AdaptiveProjectedGuidance,
+            AutoGuidance,
+            ClassifierFreeGuidance,
+            ClassifierFreeZeroStarGuidance,
+            SkipLayerGuidance,
+            SmoothedEnergyGuidance,
+            TangentialClassifierFreeGuidance,
+        )
         from .hooks import (
             FasterCacheConfig,
             HookRegistry,
+            LayerSkipConfig,
             PyramidAttentionBroadcastConfig,
+            SmoothedEnergyGuidanceConfig,
+            apply_layer_skip,
             apply_faster_cache,
             apply_pyramid_attention_broadcast,
         )

@@ -839,6 +894,13 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
             ScoreSdeVePipeline,
             StableDiffusionMixin,
         )
+        from .modular_pipelines import (
+            ModularLoader,
+            ModularPipeline,
+            ModularPipelineBlocks,
+            ComponentSpec,
+            ComponentsManager,
+        )
         from .quantizers import DiffusersQuantizer
         from .schedulers import (
             AmusedScheduler,

@@ -1094,7 +1156,16 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
             WuerstchenDecoderPipeline,
             WuerstchenPriorPipeline,
         )
+        try:
+            if not (is_torch_available() and is_transformers_available()):
+                raise OptionalDependencyNotAvailable()
+        except OptionalDependencyNotAvailable:
+            from .utils.dummy_torch_and_transformers_objects import *  # noqa F403
+        else:
+            from .modular_pipelines import (
+                StableDiffusionXLAutoPipeline,
+                StableDiffusionXLModularLoader,
+            )
     try:
         if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()):
             raise OptionalDependencyNotAvailable()
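The practical effect of these `_import_structure` additions is that the new guider and hook classes become importable from the package root once the optional dependencies are installed. A minimal sketch of what that enables, assuming a diffusers build that includes this branch:

```python
# Sketch only: assumes a diffusers install built from the branch shown in this diff.
from diffusers import (
    ClassifierFreeGuidance,  # exposed via _import_structure["guiders"]
    LayerSkipConfig,         # exposed via _import_structure["hooks"]
)

# A guider is a plain configuration/state object; no pipeline is needed to construct it.
cfg = ClassifierFreeGuidance(guidance_scale=5.0)
print(cfg.guidance_scale)  # 5.0
```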
src/diffusers/guiders/__init__.py (new file, 29 lines)
@@ -0,0 +1,29 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Union

from ..utils import is_torch_available


if is_torch_available():
    from .adaptive_projected_guidance import AdaptiveProjectedGuidance
    from .auto_guidance import AutoGuidance
    from .classifier_free_guidance import ClassifierFreeGuidance
    from .classifier_free_zero_star_guidance import ClassifierFreeZeroStarGuidance
    from .skip_layer_guidance import SkipLayerGuidance
    from .smoothed_energy_guidance import SmoothedEnergyGuidance
    from .tangential_classifier_free_guidance import TangentialClassifierFreeGuidance

    GuiderType = Union[AdaptiveProjectedGuidance, AutoGuidance, ClassifierFreeGuidance, ClassifierFreeZeroStarGuidance, SkipLayerGuidance, SmoothedEnergyGuidance, TangentialClassifierFreeGuidance]
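`GuiderType` is just a `Union` alias over the concrete guider classes, so downstream code can type-hint "any guider" without caring which technique is configured. A small illustrative sketch; the `describe_guider` helper is hypothetical, not part of the diff:

```python
# Sketch: assumes the guiders subpackage from this branch is importable and torch is available.
from diffusers.guiders import ClassifierFreeGuidance, GuiderType


def describe_guider(guider: GuiderType) -> str:
    # Hypothetical helper: reports which guidance technique is configured.
    return f"{type(guider).__name__}(scale={guider.guidance_scale})"


print(describe_guider(ClassifierFreeGuidance(guidance_scale=7.5)))
# ClassifierFreeGuidance(scale=7.5)
```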
src/diffusers/guiders/adaptive_projected_guidance.py (new file, 184 lines)
@@ -0,0 +1,184 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from typing import Optional, List, TYPE_CHECKING, Dict, Union, Tuple

import torch

from .guider_utils import BaseGuidance, rescale_noise_cfg


if TYPE_CHECKING:
    from ..modular_pipelines.modular_pipeline import BlockState


class AdaptiveProjectedGuidance(BaseGuidance):
    """
    Adaptive Projected Guidance (APG): https://huggingface.co/papers/2410.02416

    Args:
        guidance_scale (`float`, defaults to `7.5`):
            The scale parameter for classifier-free guidance. Higher values result in stronger conditioning on the text
            prompt, while lower values allow for more freedom in generation. Higher values may lead to saturation and
            deterioration of image quality.
        adaptive_projected_guidance_momentum (`float`, defaults to `None`):
            The momentum parameter for the adaptive projected guidance. Disabled if set to `None`.
        adaptive_projected_guidance_rescale (`float`, defaults to `15.0`):
            The rescale factor applied to the noise predictions. This is used to improve image quality and fix
            overexposure.
        guidance_rescale (`float`, defaults to `0.0`):
            The rescale factor applied to the noise predictions. This is used to improve image quality and fix
            overexposure. Based on Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
            Flawed](https://huggingface.co/papers/2305.08891).
        use_original_formulation (`bool`, defaults to `False`):
            Whether to use the original formulation of classifier-free guidance as proposed in the paper. By default,
            we use the diffusers-native implementation that has been in the codebase for a long time. See
            [~guiders.classifier_free_guidance.ClassifierFreeGuidance] for more details.
        start (`float`, defaults to `0.0`):
            The fraction of the total number of denoising steps after which guidance starts.
        stop (`float`, defaults to `1.0`):
            The fraction of the total number of denoising steps after which guidance stops.
    """

    _input_predictions = ["pred_cond", "pred_uncond"]

    def __init__(
        self,
        guidance_scale: float = 7.5,
        adaptive_projected_guidance_momentum: Optional[float] = None,
        adaptive_projected_guidance_rescale: float = 15.0,
        eta: float = 1.0,
        guidance_rescale: float = 0.0,
        use_original_formulation: bool = False,
        start: float = 0.0,
        stop: float = 1.0,
    ):
        super().__init__(start, stop)

        self.guidance_scale = guidance_scale
        self.adaptive_projected_guidance_momentum = adaptive_projected_guidance_momentum
        self.adaptive_projected_guidance_rescale = adaptive_projected_guidance_rescale
        self.eta = eta
        self.guidance_rescale = guidance_rescale
        self.use_original_formulation = use_original_formulation
        self.momentum_buffer = None

    def prepare_inputs(self, data: "BlockState", input_fields: Optional[Dict[str, Union[str, Tuple[str, str]]]] = None) -> List["BlockState"]:
        if input_fields is None:
            input_fields = self._input_fields

        if self._step == 0:
            if self.adaptive_projected_guidance_momentum is not None:
                self.momentum_buffer = MomentumBuffer(self.adaptive_projected_guidance_momentum)
        tuple_indices = [0] if self.num_conditions == 1 else [0, 1]
        data_batches = []
        for i in range(self.num_conditions):
            data_batch = self._prepare_batch(input_fields, data, tuple_indices[i], self._input_predictions[i])
            data_batches.append(data_batch)
        return data_batches

    def forward(self, pred_cond: torch.Tensor, pred_uncond: Optional[torch.Tensor] = None) -> torch.Tensor:
        pred = None

        if not self._is_apg_enabled():
            pred = pred_cond
        else:
            pred = normalized_guidance(
                pred_cond,
                pred_uncond,
                self.guidance_scale,
                self.momentum_buffer,
                self.eta,
                self.adaptive_projected_guidance_rescale,
                self.use_original_formulation,
            )

        if self.guidance_rescale > 0.0:
            pred = rescale_noise_cfg(pred, pred_cond, self.guidance_rescale)

        return pred, {}

    @property
    def is_conditional(self) -> bool:
        return self._count_prepared == 1

    @property
    def num_conditions(self) -> int:
        num_conditions = 1
        if self._is_apg_enabled():
            num_conditions += 1
        return num_conditions

    def _is_apg_enabled(self) -> bool:
        if not self._enabled:
            return False

        is_within_range = True
        if self._num_inference_steps is not None:
            skip_start_step = int(self._start * self._num_inference_steps)
            skip_stop_step = int(self._stop * self._num_inference_steps)
            is_within_range = skip_start_step <= self._step < skip_stop_step

        is_close = False
        if self.use_original_formulation:
            is_close = math.isclose(self.guidance_scale, 0.0)
        else:
            is_close = math.isclose(self.guidance_scale, 1.0)

        return is_within_range and not is_close


class MomentumBuffer:
    def __init__(self, momentum: float):
        self.momentum = momentum
        self.running_average = 0

    def update(self, update_value: torch.Tensor):
        new_average = self.momentum * self.running_average
        self.running_average = update_value + new_average


def normalized_guidance(
    pred_cond: torch.Tensor,
    pred_uncond: torch.Tensor,
    guidance_scale: float,
    momentum_buffer: Optional[MomentumBuffer] = None,
    eta: float = 1.0,
    norm_threshold: float = 0.0,
    use_original_formulation: bool = False,
):
    diff = pred_cond - pred_uncond
    dim = [-i for i in range(1, len(diff.shape))]

    if momentum_buffer is not None:
        momentum_buffer.update(diff)
        diff = momentum_buffer.running_average

    if norm_threshold > 0:
        ones = torch.ones_like(diff)
        diff_norm = diff.norm(p=2, dim=dim, keepdim=True)
        scale_factor = torch.minimum(ones, norm_threshold / diff_norm)
        diff = diff * scale_factor

    v0, v1 = diff.double(), pred_cond.double()
    v1 = torch.nn.functional.normalize(v1, dim=dim)
    v0_parallel = (v0 * v1).sum(dim=dim, keepdim=True) * v1
    v0_orthogonal = v0 - v0_parallel
    diff_parallel, diff_orthogonal = v0_parallel.type_as(diff), v0_orthogonal.type_as(diff)
    normalized_update = diff_orthogonal + eta * diff_parallel

    pred = pred_cond if use_original_formulation else pred_uncond
    pred = pred + guidance_scale * normalized_update

    return pred
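The core of APG is the projection step in `normalized_guidance`: the guidance direction `pred_cond - pred_uncond` is split into components parallel and orthogonal to the conditional prediction, and only the parallel part is down-weighted by `eta`. A standalone sketch of that decomposition on toy tensors, using the same operations as the file above:

```python
import torch

torch.manual_seed(0)
pred_cond = torch.randn(2, 4, 8, 8)    # toy "conditional" noise prediction
pred_uncond = torch.randn(2, 4, 8, 8)  # toy "unconditional" noise prediction
eta = 0.5

diff = pred_cond - pred_uncond
dim = [-i for i in range(1, diff.ndim)]  # all non-batch dims, as in normalized_guidance

v1 = torch.nn.functional.normalize(pred_cond.double(), dim=dim)
v0 = diff.double()
v0_parallel = (v0 * v1).sum(dim=dim, keepdim=True) * v1  # component along pred_cond
v0_orthogonal = v0 - v0_parallel                          # remainder
update = (v0_orthogonal + eta * v0_parallel).type_as(diff)

# Sanity check: parallel + orthogonal reconstructs the original difference.
assert torch.allclose((v0_parallel + v0_orthogonal).type_as(diff), diff, atol=1e-5)
print(update.shape)  # torch.Size([2, 4, 8, 8])
```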
src/diffusers/guiders/auto_guidance.py (new file, 177 lines)
@@ -0,0 +1,177 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from typing import List, Optional, Union, TYPE_CHECKING, Dict, Tuple

import torch

from ..hooks import HookRegistry, LayerSkipConfig
from ..hooks.layer_skip import _apply_layer_skip_hook
from .guider_utils import BaseGuidance, rescale_noise_cfg


if TYPE_CHECKING:
    from ..modular_pipelines.modular_pipeline import BlockState


class AutoGuidance(BaseGuidance):
    """
    AutoGuidance: https://huggingface.co/papers/2406.02507

    Args:
        guidance_scale (`float`, defaults to `7.5`):
            The scale parameter for classifier-free guidance. Higher values result in stronger conditioning on the text
            prompt, while lower values allow for more freedom in generation. Higher values may lead to saturation and
            deterioration of image quality.
        auto_guidance_layers (`int` or `List[int]`, *optional*):
            The layer indices to apply skip layer guidance to. Can be a single integer or a list of integers. If not
            provided, `skip_layer_config` must be provided.
        auto_guidance_config (`LayerSkipConfig` or `List[LayerSkipConfig]`, *optional*):
            The configuration for the skip layer guidance. Can be a single `LayerSkipConfig` or a list of
            `LayerSkipConfig`. If not provided, `skip_layer_guidance_layers` must be provided.
        dropout (`float`, *optional*):
            The dropout probability for autoguidance on the enabled skip layers (either with `auto_guidance_layers` or
            `auto_guidance_config`). If not provided, the dropout probability will be set to 1.0.
        guidance_rescale (`float`, defaults to `0.0`):
            The rescale factor applied to the noise predictions. This is used to improve image quality and fix
            overexposure. Based on Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
            Flawed](https://huggingface.co/papers/2305.08891).
        use_original_formulation (`bool`, defaults to `False`):
            Whether to use the original formulation of classifier-free guidance as proposed in the paper. By default,
            we use the diffusers-native implementation that has been in the codebase for a long time. See
            [~guiders.classifier_free_guidance.ClassifierFreeGuidance] for more details.
        start (`float`, defaults to `0.0`):
            The fraction of the total number of denoising steps after which guidance starts.
        stop (`float`, defaults to `1.0`):
            The fraction of the total number of denoising steps after which guidance stops.
    """

    _input_predictions = ["pred_cond", "pred_uncond"]

    def __init__(
        self,
        guidance_scale: float = 7.5,
        auto_guidance_layers: Optional[Union[int, List[int]]] = None,
        auto_guidance_config: Union[LayerSkipConfig, List[LayerSkipConfig]] = None,
        dropout: Optional[float] = None,
        guidance_rescale: float = 0.0,
        use_original_formulation: bool = False,
        start: float = 0.0,
        stop: float = 1.0,
    ):
        super().__init__(start, stop)

        self.guidance_scale = guidance_scale
        self.auto_guidance_layers = auto_guidance_layers
        self.auto_guidance_config = auto_guidance_config
        self.dropout = dropout
        self.guidance_rescale = guidance_rescale
        self.use_original_formulation = use_original_formulation

        if auto_guidance_layers is None and auto_guidance_config is None:
            raise ValueError(
                "Either `auto_guidance_layers` or `auto_guidance_config` must be provided to enable Skip Layer Guidance."
            )
        if auto_guidance_layers is not None and auto_guidance_config is not None:
            raise ValueError("Only one of `auto_guidance_layers` or `auto_guidance_config` can be provided.")
        if (dropout is None and auto_guidance_layers is not None) or (dropout is not None and auto_guidance_layers is None):
            raise ValueError("`dropout` must be provided if `auto_guidance_layers` is provided.")

        if auto_guidance_layers is not None:
            if isinstance(auto_guidance_layers, int):
                auto_guidance_layers = [auto_guidance_layers]
            if not isinstance(auto_guidance_layers, list):
                raise ValueError(
                    f"Expected `auto_guidance_layers` to be an int or a list of ints, but got {type(auto_guidance_layers)}."
                )
            auto_guidance_config = [LayerSkipConfig(layer, fqn="auto", dropout=dropout) for layer in auto_guidance_layers]

        if isinstance(auto_guidance_config, LayerSkipConfig):
            auto_guidance_config = [auto_guidance_config]

        if not isinstance(auto_guidance_config, list):
            raise ValueError(
                f"Expected `auto_guidance_config` to be a LayerSkipConfig or a list of LayerSkipConfig, but got {type(auto_guidance_config)}."
            )

        self.auto_guidance_config = auto_guidance_config
        self._auto_guidance_hook_names = [f"AutoGuidance_{i}" for i in range(len(self.auto_guidance_config))]

    def prepare_models(self, denoiser: torch.nn.Module) -> None:
        self._count_prepared += 1
        if self._is_ag_enabled() and self.is_unconditional:
            for name, config in zip(self._auto_guidance_hook_names, self.auto_guidance_config):
                _apply_layer_skip_hook(denoiser, config, name=name)

    def cleanup_models(self, denoiser: torch.nn.Module) -> None:
        if self._is_ag_enabled() and self.is_unconditional:
            for name in self._auto_guidance_hook_names:
                registry = HookRegistry.check_if_exists_or_initialize(denoiser)
                registry.remove_hook(name, recurse=True)

    def prepare_inputs(self, data: "BlockState", input_fields: Optional[Dict[str, Union[str, Tuple[str, str]]]] = None) -> List["BlockState"]:
        if input_fields is None:
            input_fields = self._input_fields

        tuple_indices = [0] if self.num_conditions == 1 else [0, 1]
        data_batches = []
        for i in range(self.num_conditions):
            data_batch = self._prepare_batch(input_fields, data, tuple_indices[i], self._input_predictions[i])
            data_batches.append(data_batch)
        return data_batches

    def forward(self, pred_cond: torch.Tensor, pred_uncond: Optional[torch.Tensor] = None) -> torch.Tensor:
        pred = None

        if not self._is_ag_enabled():
            pred = pred_cond
        else:
            shift = pred_cond - pred_uncond
            pred = pred_cond if self.use_original_formulation else pred_uncond
            pred = pred + self.guidance_scale * shift

        if self.guidance_rescale > 0.0:
            pred = rescale_noise_cfg(pred, pred_cond, self.guidance_rescale)

        return pred, {}

    @property
    def is_conditional(self) -> bool:
        return self._count_prepared == 1

    @property
    def num_conditions(self) -> int:
        num_conditions = 1
        if self._is_ag_enabled():
            num_conditions += 1
        return num_conditions

    def _is_ag_enabled(self) -> bool:
        if not self._enabled:
            return False

        is_within_range = True
        if self._num_inference_steps is not None:
            skip_start_step = int(self._start * self._num_inference_steps)
            skip_stop_step = int(self._stop * self._num_inference_steps)
            is_within_range = skip_start_step <= self._step < skip_stop_step

        is_close = False
        if self.use_original_formulation:
            is_close = math.isclose(self.guidance_scale, 0.0)
        else:
            is_close = math.isclose(self.guidance_scale, 1.0)

        return is_within_range and not is_close
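AutoGuidance realizes the "bad version of itself" model by attaching layer-skip hooks to the denoiser when producing the unconditional-branch prediction. A construction sketch, assuming the `LayerSkipConfig` and hook machinery from this branch is importable; it only shows how the integer layer list is expanded into per-layer configs:

```python
# Sketch only: assumes the guiders/hooks modules from this branch are installed.
from diffusers.guiders import AutoGuidance

# Skip transformer blocks 2 and 7 with probability 0.1 when computing the
# degraded-model ("unconditional") prediction.
guider = AutoGuidance(
    guidance_scale=3.0,
    auto_guidance_layers=[2, 7],
    dropout=0.1,
)

print(len(guider.auto_guidance_config))   # 2, one LayerSkipConfig per listed layer
print(guider._auto_guidance_hook_names)   # ['AutoGuidance_0', 'AutoGuidance_1']
```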
src/diffusers/guiders/classifier_free_guidance.py (new file, 132 lines)
@@ -0,0 +1,132 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from typing import Optional, List, TYPE_CHECKING, Dict, Union, Tuple

import torch

from .guider_utils import BaseGuidance, rescale_noise_cfg


if TYPE_CHECKING:
    from ..modular_pipelines.modular_pipeline import BlockState


class ClassifierFreeGuidance(BaseGuidance):
    """
    Classifier-free guidance (CFG): https://huggingface.co/papers/2207.12598

    CFG is a technique used to improve generation quality and condition-following in diffusion models. It works by
    jointly training a model on both conditional and unconditional data, and using a weighted sum of the two during
    inference. This allows the model to tradeoff between generation quality and sample diversity.
    The original paper proposes scaling and shifting the conditional distribution based on the difference between
    conditional and unconditional predictions. [x_pred = x_cond + scale * (x_cond - x_uncond)]

    Diffusers implemented the scaling and shifting on the unconditional prediction instead based on the [Imagen
    paper](https://huggingface.co/papers/2205.11487), which is equivalent to what the original paper proposed in
    theory. [x_pred = x_uncond + scale * (x_cond - x_uncond)]

    The intuition behind the original formulation can be thought of as moving the conditional distribution estimates
    further away from the unconditional distribution estimates, while the diffusers-native implementation can be
    thought of as moving the unconditional distribution towards the conditional distribution estimates to get rid of
    the unconditional predictions (usually negative features like "bad quality, bad anatomy, watermarks", etc.)

    The `use_original_formulation` argument can be set to `True` to use the original CFG formulation mentioned in the
    paper. By default, we use the diffusers-native implementation that has been in the codebase for a long time.

    Args:
        guidance_scale (`float`, defaults to `7.5`):
            The scale parameter for classifier-free guidance. Higher values result in stronger conditioning on the text
            prompt, while lower values allow for more freedom in generation. Higher values may lead to saturation and
            deterioration of image quality.
        guidance_rescale (`float`, defaults to `0.0`):
            The rescale factor applied to the noise predictions. This is used to improve image quality and fix
            overexposure. Based on Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
            Flawed](https://huggingface.co/papers/2305.08891).
        use_original_formulation (`bool`, defaults to `False`):
            Whether to use the original formulation of classifier-free guidance as proposed in the paper. By default,
            we use the diffusers-native implementation that has been in the codebase for a long time. See
            [~guiders.classifier_free_guidance.ClassifierFreeGuidance] for more details.
        start (`float`, defaults to `0.0`):
            The fraction of the total number of denoising steps after which guidance starts.
        stop (`float`, defaults to `1.0`):
            The fraction of the total number of denoising steps after which guidance stops.
    """

    _input_predictions = ["pred_cond", "pred_uncond"]

    def __init__(
        self, guidance_scale: float = 7.5, guidance_rescale: float = 0.0, use_original_formulation: bool = False, start: float = 0.0, stop: float = 1.0
    ):
        super().__init__(start, stop)

        self.guidance_scale = guidance_scale
        self.guidance_rescale = guidance_rescale
        self.use_original_formulation = use_original_formulation

    def prepare_inputs(self, data: "BlockState", input_fields: Optional[Dict[str, Union[str, Tuple[str, str]]]] = None) -> List["BlockState"]:
        if input_fields is None:
            input_fields = self._input_fields

        tuple_indices = [0] if self.num_conditions == 1 else [0, 1]
        data_batches = []
        for i in range(self.num_conditions):
            data_batch = self._prepare_batch(input_fields, data, tuple_indices[i], self._input_predictions[i])
            data_batches.append(data_batch)
        return data_batches

    def forward(self, pred_cond: torch.Tensor, pred_uncond: Optional[torch.Tensor] = None) -> torch.Tensor:
        pred = None

        if not self._is_cfg_enabled():
            pred = pred_cond
        else:
            shift = pred_cond - pred_uncond
            pred = pred_cond if self.use_original_formulation else pred_uncond
            pred = pred + self.guidance_scale * shift

        if self.guidance_rescale > 0.0:
            pred = rescale_noise_cfg(pred, pred_cond, self.guidance_rescale)

        return pred, {}

    @property
    def is_conditional(self) -> bool:
        return self._count_prepared == 1

    @property
    def num_conditions(self) -> int:
        num_conditions = 1
        if self._is_cfg_enabled():
            num_conditions += 1
        return num_conditions

    def _is_cfg_enabled(self) -> bool:
        if not self._enabled:
            return False

        is_within_range = True
        if self._num_inference_steps is not None:
            skip_start_step = int(self._start * self._num_inference_steps)
            skip_stop_step = int(self._stop * self._num_inference_steps)
            is_within_range = skip_start_step <= self._step < skip_stop_step

        is_close = False
        if self.use_original_formulation:
            is_close = math.isclose(self.guidance_scale, 0.0)
        else:
            is_close = math.isclose(self.guidance_scale, 1.0)

        return is_within_range and not is_close
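The two CFG formulations described in the docstring differ only in which prediction the scaled difference is added to. A tiny numeric sketch of `forward`'s combination step on toy values:

```python
import torch

pred_cond = torch.tensor([1.0, 2.0])
pred_uncond = torch.tensor([0.5, 1.0])
scale = 2.0

shift = pred_cond - pred_uncond           # tensor([0.5, 1.0])

# diffusers-native formulation: x_uncond + scale * (x_cond - x_uncond)
native = pred_uncond + scale * shift      # tensor([1.5, 3.0])

# original-paper formulation: x_cond + scale * (x_cond - x_uncond)
original = pred_cond + scale * shift      # tensor([2.0, 4.0])

print(native, original)
```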
src/diffusers/guiders/classifier_free_zero_star_guidance.py (new file, 148 lines)
@@ -0,0 +1,148 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from typing import Optional, List, TYPE_CHECKING, Dict, Union, Tuple

import torch

from .guider_utils import BaseGuidance, rescale_noise_cfg


if TYPE_CHECKING:
    from ..modular_pipelines.modular_pipeline import BlockState


class ClassifierFreeZeroStarGuidance(BaseGuidance):
    """
    Classifier-free Zero* (CFG-Zero*): https://huggingface.co/papers/2503.18886

    This is an implementation of the Classifier-Free Zero* guidance technique, which is a variant of classifier-free
    guidance. It proposes zero initialization of the noise predictions for the first few steps of the diffusion
    process, and also introduces an optimal rescaling factor for the noise predictions, which can help in improving the
    quality of generated images.

    The authors of the paper suggest setting zero initialization in the first 4% of the inference steps.

    Args:
        guidance_scale (`float`, defaults to `7.5`):
            The scale parameter for classifier-free guidance. Higher values result in stronger conditioning on the text
            prompt, while lower values allow for more freedom in generation. Higher values may lead to saturation and
            deterioration of image quality.
        zero_init_steps (`int`, defaults to `1`):
            The number of inference steps for which the noise predictions are zeroed out (see Section 4.2).
        guidance_rescale (`float`, defaults to `0.0`):
            The rescale factor applied to the noise predictions. This is used to improve image quality and fix
            overexposure. Based on Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
            Flawed](https://huggingface.co/papers/2305.08891).
        use_original_formulation (`bool`, defaults to `False`):
            Whether to use the original formulation of classifier-free guidance as proposed in the paper. By default,
            we use the diffusers-native implementation that has been in the codebase for a long time. See
            [~guiders.classifier_free_guidance.ClassifierFreeGuidance] for more details.
        start (`float`, defaults to `0.01`):
            The fraction of the total number of denoising steps after which guidance starts.
        stop (`float`, defaults to `0.2`):
            The fraction of the total number of denoising steps after which guidance stops.
    """

    _input_predictions = ["pred_cond", "pred_uncond"]

    def __init__(
        self,
        guidance_scale: float = 7.5,
        zero_init_steps: int = 1,
        guidance_rescale: float = 0.0,
        use_original_formulation: bool = False,
        start: float = 0.0,
        stop: float = 1.0,
    ):
        super().__init__(start, stop)

        self.guidance_scale = guidance_scale
        self.zero_init_steps = zero_init_steps
        self.guidance_rescale = guidance_rescale
        self.use_original_formulation = use_original_formulation

    def prepare_inputs(self, data: "BlockState", input_fields: Optional[Dict[str, Union[str, Tuple[str, str]]]] = None) -> List["BlockState"]:
        if input_fields is None:
            input_fields = self._input_fields

        tuple_indices = [0] if self.num_conditions == 1 else [0, 1]
        data_batches = []
        for i in range(self.num_conditions):
            data_batch = self._prepare_batch(input_fields, data, tuple_indices[i], self._input_predictions[i])
            data_batches.append(data_batch)
        return data_batches

    def forward(self, pred_cond: torch.Tensor, pred_uncond: Optional[torch.Tensor] = None) -> torch.Tensor:
        pred = None

        if self._step < self.zero_init_steps:
            pred = torch.zeros_like(pred_cond)
        elif not self._is_cfg_enabled():
            pred = pred_cond
        else:
            pred_cond_flat = pred_cond.flatten(1)
            pred_uncond_flat = pred_uncond.flatten(1)
            alpha = cfg_zero_star_scale(pred_cond_flat, pred_uncond_flat)
            alpha = alpha.view(-1, *(1,) * (len(pred_cond.shape) - 1))
            pred_uncond = pred_uncond * alpha
            shift = pred_cond - pred_uncond
            pred = pred_cond if self.use_original_formulation else pred_uncond
            pred = pred + self.guidance_scale * shift

        if self.guidance_rescale > 0.0:
            pred = rescale_noise_cfg(pred, pred_cond, self.guidance_rescale)

        return pred, {}

    @property
    def is_conditional(self) -> bool:
        return self._count_prepared == 1

    @property
    def num_conditions(self) -> int:
        num_conditions = 1
        if self._is_cfg_enabled():
            num_conditions += 1
        return num_conditions

    def _is_cfg_enabled(self) -> bool:
        if not self._enabled:
            return False

        is_within_range = True
        if self._num_inference_steps is not None:
            skip_start_step = int(self._start * self._num_inference_steps)
            skip_stop_step = int(self._stop * self._num_inference_steps)
            is_within_range = skip_start_step <= self._step < skip_stop_step

        is_close = False
        if self.use_original_formulation:
            is_close = math.isclose(self.guidance_scale, 0.0)
        else:
            is_close = math.isclose(self.guidance_scale, 1.0)

        return is_within_range and not is_close


def cfg_zero_star_scale(cond: torch.Tensor, uncond: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    cond_dtype = cond.dtype
    cond = cond.float()
    uncond = uncond.float()
    dot_product = torch.sum(cond * uncond, dim=1, keepdim=True)
    squared_norm = torch.sum(uncond**2, dim=1, keepdim=True) + eps
    # st_star = v_cond^T * v_uncond / ||v_uncond||^2
    scale = dot_product / squared_norm
    return scale.to(dtype=cond_dtype)
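`cfg_zero_star_scale` computes the per-sample projection coefficient st* = v_cond^T v_uncond / ||v_uncond||^2, which then rescales the unconditional prediction before the usual CFG combination. A toy check of that formula against a hand-computed value:

```python
import torch

cond = torch.tensor([[2.0, 0.0]])    # flattened conditional prediction, batch of 1
uncond = torch.tensor([[1.0, 1.0]])  # flattened unconditional prediction

dot = (cond * uncond).sum(dim=1, keepdim=True)       # 2*1 + 0*1 = 2.0
sq_norm = (uncond ** 2).sum(dim=1, keepdim=True)     # 1 + 1 = 2.0
scale = dot / (sq_norm + 1e-8)                       # ~1.0, matching cfg_zero_star_scale

print(scale)  # tensor([[1.0000]]) up to the eps term
```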
src/diffusers/guiders/guider_utils.py (new file, 215 lines)
@@ -0,0 +1,215 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union

import torch

from ..utils import get_logger


if TYPE_CHECKING:
    from ..modular_pipelines.modular_pipeline import BlockState


logger = get_logger(__name__)  # pylint: disable=invalid-name


class BaseGuidance:
    r"""Base class providing the skeleton for implementing guidance techniques."""

    _input_predictions = None
    _identifier_key = "__guidance_identifier__"

    def __init__(self, start: float = 0.0, stop: float = 1.0):
        self._start = start
        self._stop = stop
        self._step: int = None
        self._num_inference_steps: int = None
        self._timestep: torch.LongTensor = None
        self._count_prepared = 0
        self._input_fields: Dict[str, Union[str, Tuple[str, str]]] = None
        self._enabled = True

        if not (0.0 <= start < 1.0):
            raise ValueError(
                f"Expected `start` to be between 0.0 and 1.0, but got {start}."
            )
        if not (start <= stop <= 1.0):
            raise ValueError(
                f"Expected `stop` to be between {start} and 1.0, but got {stop}."
            )

        if self._input_predictions is None or not isinstance(self._input_predictions, list):
            raise ValueError(
                "`_input_predictions` must be a list of required prediction names for the guidance technique."
            )

    def disable(self):
        self._enabled = False

    def enable(self):
        self._enabled = True

    def set_state(self, step: int, num_inference_steps: int, timestep: torch.LongTensor) -> None:
        self._step = step
        self._num_inference_steps = num_inference_steps
        self._timestep = timestep
        self._count_prepared = 0

    def set_input_fields(self, **kwargs: Dict[str, Union[str, Tuple[str, str]]]) -> None:
        """
        Set the input fields for the guidance technique. The input fields are used to specify the names of the
        returned attributes containing the prepared data after `prepare_inputs` is called. The prepared data is
        obtained from the values of the provided keyword arguments to this method.

        Args:
            **kwargs (`Dict[str, Union[str, Tuple[str, str]]]`):
                A dictionary where the keys are the names of the fields that will be used to store the data once
                it is prepared with `prepare_inputs`. The values can be either a string or a tuple of length 2,
                which is used to look up the required data provided for preparation.

                If a string is provided, it will be used as the conditional data (or unconditional if used with
                a guidance method that requires it). If a tuple of length 2 is provided, the first element must
                be the conditional data identifier and the second element must be the unconditional data identifier
                or None.

        Example:

        ```
        data = {"prompt_embeds": <some tensor>, "negative_prompt_embeds": <some tensor>, "latents": <some tensor>}

        BaseGuidance.set_input_fields(
            latents="latents",
            prompt_embeds=("prompt_embeds", "negative_prompt_embeds"),
        )
        ```
        """
        for key, value in kwargs.items():
            is_string = isinstance(value, str)
            is_tuple_of_str_with_len_2 = isinstance(value, tuple) and len(value) == 2 and all(isinstance(v, str) for v in value)
            if not (is_string or is_tuple_of_str_with_len_2):
                raise ValueError(
                    f"Expected `set_input_fields` to be called with a string or a tuple of string with length 2, but got {type(value)} for key {key}."
                )
        self._input_fields = kwargs

    def prepare_models(self, denoiser: torch.nn.Module) -> None:
        """
        Prepares the models for the guidance technique on a given batch of data. This method should be overridden in
        subclasses to implement specific model preparation logic.
        """
        self._count_prepared += 1

    def cleanup_models(self, denoiser: torch.nn.Module) -> None:
        """
        Cleans up the models for the guidance technique after a given batch of data. This method should be overridden in
        subclasses to implement specific model cleanup logic. It is useful for removing any hooks or other stateful
        modifications made during `prepare_models`.
        """
        pass

    def prepare_inputs(self, data: "BlockState") -> List["BlockState"]:
        raise NotImplementedError("BaseGuidance::prepare_inputs must be implemented in subclasses.")

    def __call__(self, data: List["BlockState"]) -> Any:
        if not all(hasattr(d, "noise_pred") for d in data):
            raise ValueError("Expected all data to have `noise_pred` attribute.")
        if len(data) != self.num_conditions:
            raise ValueError(
                f"Expected {self.num_conditions} data items, but got {len(data)}. Please check the input data."
            )
        forward_inputs = {getattr(d, self._identifier_key): d.noise_pred for d in data}
        return self.forward(**forward_inputs)

    def forward(self, *args, **kwargs) -> Any:
        raise NotImplementedError("BaseGuidance::forward must be implemented in subclasses.")

    @property
    def is_conditional(self) -> bool:
        raise NotImplementedError("BaseGuidance::is_conditional must be implemented in subclasses.")

    @property
    def is_unconditional(self) -> bool:
        return not self.is_conditional

    @property
    def num_conditions(self) -> int:
        raise NotImplementedError("BaseGuidance::num_conditions must be implemented in subclasses.")

    @classmethod
    def _prepare_batch(cls, input_fields: Dict[str, Union[str, Tuple[str, str]]], data: "BlockState", tuple_index: int, identifier: str) -> "BlockState":
        """
        Prepares a batch of data for the guidance technique. This method is used in the `prepare_inputs` method of
        the `BaseGuidance` class. It prepares the batch based on the provided tuple index.

        Args:
            input_fields (`Dict[str, Union[str, Tuple[str, str]]]`):
                A dictionary where the keys are the names of the fields that will be used to store the data once
                it is prepared with `prepare_inputs`. The values can be either a string or a tuple of length 2,
                which is used to look up the required data provided for preparation.
                If a string is provided, it will be used as the conditional data (or unconditional if used with
                a guidance method that requires it). If a tuple of length 2 is provided, the first element must
                be the conditional data identifier and the second element must be the unconditional data identifier
                or None.
            data (`BlockState`):
                The input data to be prepared.
            tuple_index (`int`):
                The index to use when accessing input fields that are tuples.

        Returns:
            `BlockState`: The prepared batch of data.
        """
        from ..modular_pipelines.modular_pipeline import BlockState

        if input_fields is None:
            raise ValueError("Input fields cannot be None. Please pass `input_fields` to `prepare_inputs` or call `set_input_fields` before preparing inputs.")
        data_batch = {}
        for key, value in input_fields.items():
            try:
                if isinstance(value, str):
                    data_batch[key] = getattr(data, value)
                elif isinstance(value, tuple):
                    data_batch[key] = getattr(data, value[tuple_index])
                else:
                    # We've already checked that value is a string or a tuple of strings with length 2
                    pass
            except AttributeError:
                logger.debug(f"`data` does not have attribute(s) {value}, skipping.")
        data_batch[cls._identifier_key] = identifier
        return BlockState(**data_batch)


def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    r"""
    Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
    Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
    Flawed](https://arxiv.org/pdf/2305.08891.pdf).

    Args:
        noise_cfg (`torch.Tensor`):
            The predicted noise tensor for the guided diffusion process.
        noise_pred_text (`torch.Tensor`):
            The predicted noise tensor for the text-guided diffusion process.
        guidance_rescale (`float`, *optional*, defaults to 0.0):
            A rescale factor applied to the noise predictions.

    Returns:
        noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
    """
    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
    # rescale the results from guidance (fixes overexposure)
    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
    return noise_cfg
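`BaseGuidance` only fixes the lifecycle (`set_state`, `prepare_models`, `prepare_inputs`, then `__call__`/`forward`); everything technique-specific is supplied by subclasses. A deliberately minimal, hypothetical subclass that performs no guidance at all shows the required surface (it is a sketch, not part of the diff):

```python
# Sketch only: NoOpGuidance is a hypothetical example, not a class from this branch.
import torch

from diffusers.guiders.guider_utils import BaseGuidance


class NoOpGuidance(BaseGuidance):
    """Example guider that simply returns the conditional prediction."""

    _input_predictions = ["pred_cond"]

    def prepare_inputs(self, data, input_fields=None):
        if input_fields is None:
            input_fields = self._input_fields
        # Single condition, so only one batch is prepared, tagged as "pred_cond".
        return [self._prepare_batch(input_fields, data, 0, "pred_cond")]

    def forward(self, pred_cond: torch.Tensor):
        return pred_cond, {}

    @property
    def is_conditional(self) -> bool:
        return True

    @property
    def num_conditions(self) -> int:
        return 1


guider = NoOpGuidance()
out, extra = guider.forward(torch.zeros(1, 4, 8, 8))
print(out.shape)  # torch.Size([1, 4, 8, 8])
```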
src/diffusers/guiders/skip_layer_guidance.py (new file, 251 lines)
@@ -0,0 +1,251 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from typing import List, Optional, Union, TYPE_CHECKING, Dict, Tuple

import torch

from ..hooks import HookRegistry, LayerSkipConfig
from ..hooks.layer_skip import _apply_layer_skip_hook
from .guider_utils import BaseGuidance, rescale_noise_cfg

if TYPE_CHECKING:
    from ..modular_pipelines.modular_pipeline import BlockState


class SkipLayerGuidance(BaseGuidance):
    """
    Skip Layer Guidance (SLG): https://github.com/Stability-AI/sd3.5

    Spatio-Temporal Guidance (STG): https://huggingface.co/papers/2411.18664

    SLG was introduced by StabilityAI for improving structure and anatomy coherence in generated images. It works by
    skipping the forward pass of specified transformer blocks during the denoising process on an additional
    conditional batch of data, apart from the conditional and unconditional batches already used in CFG
    ([`~guiders.classifier_free_guidance.ClassifierFreeGuidance`]), and then scaling and shifting the CFG predictions
    based on the difference between the conditional-without-skipping and conditional-with-skipping predictions.

    The intuition behind SLG is that it moves the CFG-predicted distribution estimates further away from worse
    versions of the conditional distribution estimates (because skipping layers is equivalent to using a worse
    version of the model for the conditional prediction).

    STG is an improvement and follow-up work combining ideas from SLG, PAG and similar techniques for improving
    generation quality in video diffusion models.

    Additional reading:
    - [Guiding a Diffusion Model with a Bad Version of Itself](https://huggingface.co/papers/2406.02507)

    The values for `skip_layer_guidance_scale`, `skip_layer_guidance_start`, and `skip_layer_guidance_stop` are
    defaulted to the recommendations by StabilityAI for Stable Diffusion 3.5 Medium.

    Args:
        guidance_scale (`float`, defaults to `7.5`):
            The scale parameter for classifier-free guidance. Higher values result in stronger conditioning on the
            text prompt, while lower values allow for more freedom in generation. Higher values may lead to
            saturation and deterioration of image quality.
        skip_layer_guidance_scale (`float`, defaults to `2.8`):
            The scale parameter for skip layer guidance. Anatomy and structure coherence may improve with higher
            values, but it may also lead to overexposure and saturation.
        skip_layer_guidance_start (`float`, defaults to `0.01`):
            The fraction of the total number of denoising steps after which skip layer guidance starts.
        skip_layer_guidance_stop (`float`, defaults to `0.2`):
            The fraction of the total number of denoising steps after which skip layer guidance stops.
        skip_layer_guidance_layers (`int` or `List[int]`, *optional*):
            The layer indices to apply skip layer guidance to. Can be a single integer or a list of integers. If not
            provided, `skip_layer_config` must be provided. The recommended values are `[7, 8, 9]` for Stable
            Diffusion 3.5 Medium.
        skip_layer_config (`LayerSkipConfig` or `List[LayerSkipConfig]`, *optional*):
            The configuration for the skip layer guidance. Can be a single `LayerSkipConfig` or a list of
            `LayerSkipConfig`. If not provided, `skip_layer_guidance_layers` must be provided.
        guidance_rescale (`float`, defaults to `0.0`):
            The rescale factor applied to the noise predictions. This is used to improve image quality and fix
            overexposure. Based on Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
            Flawed](https://huggingface.co/papers/2305.08891).
        use_original_formulation (`bool`, defaults to `False`):
            Whether to use the original formulation of classifier-free guidance as proposed in the paper. By default,
            we use the diffusers-native implementation that has been in the codebase for a long time. See
            [`~guiders.classifier_free_guidance.ClassifierFreeGuidance`] for more details.
        start (`float`, defaults to `0.01`):
            The fraction of the total number of denoising steps after which guidance starts.
        stop (`float`, defaults to `0.2`):
            The fraction of the total number of denoising steps after which guidance stops.
    """

    _input_predictions = ["pred_cond", "pred_uncond", "pred_cond_skip"]

    def __init__(
        self,
        guidance_scale: float = 7.5,
        skip_layer_guidance_scale: float = 2.8,
        skip_layer_guidance_start: float = 0.01,
        skip_layer_guidance_stop: float = 0.2,
        skip_layer_guidance_layers: Optional[Union[int, List[int]]] = None,
        skip_layer_config: Union[LayerSkipConfig, List[LayerSkipConfig]] = None,
        guidance_rescale: float = 0.0,
        use_original_formulation: bool = False,
        start: float = 0.0,
        stop: float = 1.0,
    ):
        super().__init__(start, stop)

        self.guidance_scale = guidance_scale
        self.skip_layer_guidance_scale = skip_layer_guidance_scale
        self.skip_layer_guidance_start = skip_layer_guidance_start
        self.skip_layer_guidance_stop = skip_layer_guidance_stop
        self.guidance_rescale = guidance_rescale
        self.use_original_formulation = use_original_formulation

        if not (0.0 <= skip_layer_guidance_start < 1.0):
            raise ValueError(
                f"Expected `skip_layer_guidance_start` to be between 0.0 and 1.0, but got {skip_layer_guidance_start}."
            )
        if not (skip_layer_guidance_start <= skip_layer_guidance_stop <= 1.0):
            raise ValueError(
                f"Expected `skip_layer_guidance_stop` to be between 0.0 and 1.0, but got {skip_layer_guidance_stop}."
            )

        if skip_layer_guidance_layers is None and skip_layer_config is None:
            raise ValueError(
                "Either `skip_layer_guidance_layers` or `skip_layer_config` must be provided to enable Skip Layer Guidance."
            )
        if skip_layer_guidance_layers is not None and skip_layer_config is not None:
            raise ValueError("Only one of `skip_layer_guidance_layers` or `skip_layer_config` can be provided.")

        if skip_layer_guidance_layers is not None:
            if isinstance(skip_layer_guidance_layers, int):
                skip_layer_guidance_layers = [skip_layer_guidance_layers]
            if not isinstance(skip_layer_guidance_layers, list):
                raise ValueError(
                    f"Expected `skip_layer_guidance_layers` to be an int or a list of ints, but got {type(skip_layer_guidance_layers)}."
                )
            skip_layer_config = [LayerSkipConfig(layer, fqn="auto") for layer in skip_layer_guidance_layers]

        if isinstance(skip_layer_config, LayerSkipConfig):
            skip_layer_config = [skip_layer_config]

        if not isinstance(skip_layer_config, list):
            raise ValueError(
                f"Expected `skip_layer_config` to be a LayerSkipConfig or a list of LayerSkipConfig, but got {type(skip_layer_config)}."
            )

        self.skip_layer_config = skip_layer_config
        self._skip_layer_hook_names = [f"SkipLayerGuidance_{i}" for i in range(len(self.skip_layer_config))]

    def prepare_models(self, denoiser: torch.nn.Module) -> None:
        self._count_prepared += 1
        if self._is_slg_enabled() and self.is_conditional and self._count_prepared > 1:
            for name, config in zip(self._skip_layer_hook_names, self.skip_layer_config):
                _apply_layer_skip_hook(denoiser, config, name=name)

    def cleanup_models(self, denoiser: torch.nn.Module) -> None:
        if self._is_slg_enabled() and self.is_conditional and self._count_prepared > 1:
            registry = HookRegistry.check_if_exists_or_initialize(denoiser)
            # Remove the hooks after inference
            for hook_name in self._skip_layer_hook_names:
                registry.remove_hook(hook_name, recurse=True)

    def prepare_inputs(
        self, data: "BlockState", input_fields: Optional[Dict[str, Union[str, Tuple[str, str]]]] = None
    ) -> List["BlockState"]:
        if input_fields is None:
            input_fields = self._input_fields

        if self.num_conditions == 1:
            tuple_indices = [0]
            input_predictions = ["pred_cond"]
        elif self.num_conditions == 2:
            tuple_indices = [0, 1]
            input_predictions = (
                ["pred_cond", "pred_uncond"] if self._is_cfg_enabled() else ["pred_cond", "pred_cond_skip"]
            )
        else:
            tuple_indices = [0, 1, 0]
            input_predictions = ["pred_cond", "pred_uncond", "pred_cond_skip"]
        data_batches = []
        for i in range(self.num_conditions):
            data_batch = self._prepare_batch(input_fields, data, tuple_indices[i], input_predictions[i])
            data_batches.append(data_batch)
        return data_batches

    def forward(
        self,
        pred_cond: torch.Tensor,
        pred_uncond: Optional[torch.Tensor] = None,
        pred_cond_skip: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        pred = None

        if not self._is_cfg_enabled() and not self._is_slg_enabled():
            pred = pred_cond
        elif not self._is_cfg_enabled():
            shift = pred_cond - pred_cond_skip
            pred = pred_cond if self.use_original_formulation else pred_cond_skip
            pred = pred + self.skip_layer_guidance_scale * shift
        elif not self._is_slg_enabled():
            shift = pred_cond - pred_uncond
            pred = pred_cond if self.use_original_formulation else pred_uncond
            pred = pred + self.guidance_scale * shift
        else:
            shift = pred_cond - pred_uncond
            shift_skip = pred_cond - pred_cond_skip
            pred = pred_cond if self.use_original_formulation else pred_uncond
            pred = pred + self.guidance_scale * shift + self.skip_layer_guidance_scale * shift_skip

        if self.guidance_rescale > 0.0:
            pred = rescale_noise_cfg(pred, pred_cond, self.guidance_rescale)

        return pred, {}

    @property
    def is_conditional(self) -> bool:
        return self._count_prepared == 1 or self._count_prepared == 3

    @property
    def num_conditions(self) -> int:
        num_conditions = 1
        if self._is_cfg_enabled():
            num_conditions += 1
        if self._is_slg_enabled():
            num_conditions += 1
        return num_conditions

    def _is_cfg_enabled(self) -> bool:
        if not self._enabled:
            return False

        is_within_range = True
        if self._num_inference_steps is not None:
            skip_start_step = int(self._start * self._num_inference_steps)
            skip_stop_step = int(self._stop * self._num_inference_steps)
            is_within_range = skip_start_step <= self._step < skip_stop_step

        is_close = False
        if self.use_original_formulation:
            is_close = math.isclose(self.guidance_scale, 0.0)
        else:
            is_close = math.isclose(self.guidance_scale, 1.0)

        return is_within_range and not is_close

    def _is_slg_enabled(self) -> bool:
        if not self._enabled:
            return False

        is_within_range = True
        if self._num_inference_steps is not None:
            skip_start_step = int(self.skip_layer_guidance_start * self._num_inference_steps)
            skip_stop_step = int(self.skip_layer_guidance_stop * self._num_inference_steps)
            is_within_range = skip_start_step < self._step < skip_stop_step

        is_zero = math.isclose(self.skip_layer_guidance_scale, 0.0)

        return is_within_range and not is_zero
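When both CFG and SLG are active, the branching in `forward` above reduces to a single tensor combination. A minimal standalone sketch of that combination (diffusers-native formulation, with dummy tensors standing in for real denoiser outputs; not part of the diff):

```python
# Sketch of the SkipLayerGuidance prediction combination when CFG and SLG are both enabled.
import torch

guidance_scale = 7.5             # CFG scale
skip_layer_guidance_scale = 2.8  # SLG scale

pred_cond = torch.randn(1, 4, 64, 64)       # conditional prediction (full model)
pred_uncond = torch.randn(1, 4, 64, 64)     # unconditional prediction
pred_cond_skip = torch.randn(1, 4, 64, 64)  # conditional prediction with layers skipped

shift = pred_cond - pred_uncond
shift_skip = pred_cond - pred_cond_skip
pred = pred_uncond + guidance_scale * shift + skip_layer_guidance_scale * shift_skip
print(pred.shape)
```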
src/diffusers/guiders/smoothed_energy_guidance.py (new file, 244 lines)
@@ -0,0 +1,244 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from typing import List, Optional, Union, TYPE_CHECKING, Dict, Tuple

import torch

from ..hooks import HookRegistry
from ..hooks.smoothed_energy_guidance_utils import SmoothedEnergyGuidanceConfig, _apply_smoothed_energy_guidance_hook
from .guider_utils import BaseGuidance, rescale_noise_cfg

if TYPE_CHECKING:
    from ..modular_pipelines.modular_pipeline import BlockState


class SmoothedEnergyGuidance(BaseGuidance):
    """
    Smoothed Energy Guidance (SEG): https://huggingface.co/papers/2408.00760

    SEG is only supported as an experimental prototype feature for now, so the implementation may be modified
    in the future without warning or guarantee of reproducibility. This implementation assumes:
    - Generated images are square (height == width)
    - The model does not combine different modalities together (e.g., text and image latent streams are
      not combined together such as Flux)

    Args:
        guidance_scale (`float`, defaults to `7.5`):
            The scale parameter for classifier-free guidance. Higher values result in stronger conditioning on the
            text prompt, while lower values allow for more freedom in generation. Higher values may lead to
            saturation and deterioration of image quality.
        seg_guidance_scale (`float`, defaults to `3.0`):
            The scale parameter for smoothed energy guidance. Anatomy and structure coherence may improve with higher
            values, but it may also lead to overexposure and saturation.
        seg_blur_sigma (`float`, defaults to `9999999.0`):
            The amount by which we blur the attention weights. Setting this value greater than 9999.0 results in
            infinite blur, which means uniform queries. Controlling it exponentially is empirically effective.
        seg_blur_threshold_inf (`float`, defaults to `9999.0`):
            The threshold above which the blur is considered infinite.
        seg_guidance_start (`float`, defaults to `0.0`):
            The fraction of the total number of denoising steps after which smoothed energy guidance starts.
        seg_guidance_stop (`float`, defaults to `1.0`):
            The fraction of the total number of denoising steps after which smoothed energy guidance stops.
        seg_guidance_layers (`int` or `List[int]`, *optional*):
            The layer indices to apply smoothed energy guidance to. Can be a single integer or a list of integers.
            If not provided, `seg_guidance_config` must be provided. The recommended values are `[7, 8, 9]` for
            Stable Diffusion 3.5 Medium.
        seg_guidance_config (`SmoothedEnergyGuidanceConfig` or `List[SmoothedEnergyGuidanceConfig]`, *optional*):
            The configuration for the smoothed energy layer guidance. Can be a single `SmoothedEnergyGuidanceConfig`
            or a list of `SmoothedEnergyGuidanceConfig`. If not provided, `seg_guidance_layers` must be provided.
        guidance_rescale (`float`, defaults to `0.0`):
            The rescale factor applied to the noise predictions. This is used to improve image quality and fix
            overexposure. Based on Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
            Flawed](https://huggingface.co/papers/2305.08891).
        use_original_formulation (`bool`, defaults to `False`):
            Whether to use the original formulation of classifier-free guidance as proposed in the paper. By default,
            we use the diffusers-native implementation that has been in the codebase for a long time. See
            [`~guiders.classifier_free_guidance.ClassifierFreeGuidance`] for more details.
        start (`float`, defaults to `0.01`):
            The fraction of the total number of denoising steps after which guidance starts.
        stop (`float`, defaults to `0.2`):
            The fraction of the total number of denoising steps after which guidance stops.
    """

    _input_predictions = ["pred_cond", "pred_uncond", "pred_cond_seg"]

    def __init__(
        self,
        guidance_scale: float = 7.5,
        seg_guidance_scale: float = 2.8,
        seg_blur_sigma: float = 9999999.0,
        seg_blur_threshold_inf: float = 9999.0,
        seg_guidance_start: float = 0.0,
        seg_guidance_stop: float = 1.0,
        seg_guidance_layers: Optional[Union[int, List[int]]] = None,
        seg_guidance_config: Union[SmoothedEnergyGuidanceConfig, List[SmoothedEnergyGuidanceConfig]] = None,
        guidance_rescale: float = 0.0,
        use_original_formulation: bool = False,
        start: float = 0.0,
        stop: float = 1.0,
    ):
        super().__init__(start, stop)

        self.guidance_scale = guidance_scale
        self.seg_guidance_scale = seg_guidance_scale
        self.seg_blur_sigma = seg_blur_sigma
        self.seg_blur_threshold_inf = seg_blur_threshold_inf
        self.seg_guidance_start = seg_guidance_start
        self.seg_guidance_stop = seg_guidance_stop
        self.guidance_rescale = guidance_rescale
        self.use_original_formulation = use_original_formulation

        if not (0.0 <= seg_guidance_start < 1.0):
            raise ValueError(
                f"Expected `seg_guidance_start` to be between 0.0 and 1.0, but got {seg_guidance_start}."
            )
        if not (seg_guidance_start <= seg_guidance_stop <= 1.0):
            raise ValueError(
                f"Expected `seg_guidance_stop` to be between 0.0 and 1.0, but got {seg_guidance_stop}."
            )

        if seg_guidance_layers is None and seg_guidance_config is None:
            raise ValueError(
                "Either `seg_guidance_layers` or `seg_guidance_config` must be provided to enable Smoothed Energy Guidance."
            )
        if seg_guidance_layers is not None and seg_guidance_config is not None:
            raise ValueError("Only one of `seg_guidance_layers` or `seg_guidance_config` can be provided.")

        if seg_guidance_layers is not None:
            if isinstance(seg_guidance_layers, int):
                seg_guidance_layers = [seg_guidance_layers]
            if not isinstance(seg_guidance_layers, list):
                raise ValueError(
                    f"Expected `seg_guidance_layers` to be an int or a list of ints, but got {type(seg_guidance_layers)}."
                )
            seg_guidance_config = [SmoothedEnergyGuidanceConfig(layer, fqn="auto") for layer in seg_guidance_layers]

        if isinstance(seg_guidance_config, SmoothedEnergyGuidanceConfig):
            seg_guidance_config = [seg_guidance_config]

        if not isinstance(seg_guidance_config, list):
            raise ValueError(
                f"Expected `seg_guidance_config` to be a SmoothedEnergyGuidanceConfig or a list of SmoothedEnergyGuidanceConfig, but got {type(seg_guidance_config)}."
            )

        self.seg_guidance_config = seg_guidance_config
        self._seg_layer_hook_names = [f"SmoothedEnergyGuidance_{i}" for i in range(len(self.seg_guidance_config))]

    def prepare_models(self, denoiser: torch.nn.Module) -> None:
        if self._is_seg_enabled() and self.is_conditional and self._count_prepared > 1:
            for name, config in zip(self._seg_layer_hook_names, self.seg_guidance_config):
                _apply_smoothed_energy_guidance_hook(denoiser, config, self.seg_blur_sigma, name=name)

    def cleanup_models(self, denoiser: torch.nn.Module):
        if self._is_seg_enabled() and self.is_conditional and self._count_prepared > 1:
            registry = HookRegistry.check_if_exists_or_initialize(denoiser)
            # Remove the hooks after inference
            for hook_name in self._seg_layer_hook_names:
                registry.remove_hook(hook_name, recurse=True)

    def prepare_inputs(
        self, data: "BlockState", input_fields: Optional[Dict[str, Union[str, Tuple[str, str]]]] = None
    ) -> List["BlockState"]:
        if input_fields is None:
            input_fields = self._input_fields

        if self.num_conditions == 1:
            tuple_indices = [0]
            input_predictions = ["pred_cond"]
        elif self.num_conditions == 2:
            tuple_indices = [0, 1]
            input_predictions = (
                ["pred_cond", "pred_uncond"] if self._is_cfg_enabled() else ["pred_cond", "pred_cond_seg"]
            )
        else:
            tuple_indices = [0, 1, 0]
            input_predictions = ["pred_cond", "pred_uncond", "pred_cond_seg"]
        data_batches = []
        for i in range(self.num_conditions):
            data_batch = self._prepare_batch(input_fields, data, tuple_indices[i], input_predictions[i])
            data_batches.append(data_batch)
        return data_batches

    def forward(
        self,
        pred_cond: torch.Tensor,
        pred_uncond: Optional[torch.Tensor] = None,
        pred_cond_seg: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        pred = None

        if not self._is_cfg_enabled() and not self._is_seg_enabled():
            pred = pred_cond
        elif not self._is_cfg_enabled():
            shift = pred_cond - pred_cond_seg
            pred = pred_cond if self.use_original_formulation else pred_cond_seg
            pred = pred + self.seg_guidance_scale * shift
        elif not self._is_seg_enabled():
            shift = pred_cond - pred_uncond
            pred = pred_cond if self.use_original_formulation else pred_uncond
            pred = pred + self.guidance_scale * shift
        else:
            shift = pred_cond - pred_uncond
            shift_seg = pred_cond - pred_cond_seg
            pred = pred_cond if self.use_original_formulation else pred_uncond
            pred = pred + self.guidance_scale * shift + self.seg_guidance_scale * shift_seg

        if self.guidance_rescale > 0.0:
            pred = rescale_noise_cfg(pred, pred_cond, self.guidance_rescale)

        return pred, {}

    @property
    def is_conditional(self) -> bool:
        return self._count_prepared == 1 or self._count_prepared == 3

    @property
    def num_conditions(self) -> int:
        num_conditions = 1
        if self._is_cfg_enabled():
            num_conditions += 1
        if self._is_seg_enabled():
            num_conditions += 1
        return num_conditions

    def _is_cfg_enabled(self) -> bool:
        if not self._enabled:
            return False

        is_within_range = True
        if self._num_inference_steps is not None:
            skip_start_step = int(self._start * self._num_inference_steps)
            skip_stop_step = int(self._stop * self._num_inference_steps)
            is_within_range = skip_start_step <= self._step < skip_stop_step

        is_close = False
        if self.use_original_formulation:
            is_close = math.isclose(self.guidance_scale, 0.0)
        else:
            is_close = math.isclose(self.guidance_scale, 1.0)

        return is_within_range and not is_close

    def _is_seg_enabled(self) -> bool:
        if not self._enabled:
            return False

        is_within_range = True
        if self._num_inference_steps is not None:
            skip_start_step = int(self.seg_guidance_start * self._num_inference_steps)
            skip_stop_step = int(self.seg_guidance_stop * self._num_inference_steps)
            is_within_range = skip_start_step < self._step < skip_stop_step

        is_zero = math.isclose(self.seg_guidance_scale, 0.0)

        return is_within_range and not is_zero
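As a usage sketch, the constructor above accepts either plain layer indices or explicit configs, but not both. This assumes the branch in this diff is installed and that the modules land at the import paths shown here:

```python
# Construction sketch only; attaching the guider to a pipeline/denoiser is not shown.
from diffusers.guiders.smoothed_energy_guidance import SmoothedEnergyGuidance
from diffusers.hooks.smoothed_energy_guidance_utils import SmoothedEnergyGuidanceConfig

# Pass layer indices and let the guider build configs with fqn="auto" ...
guider = SmoothedEnergyGuidance(guidance_scale=7.5, seg_guidance_scale=2.8, seg_guidance_layers=[7, 8, 9])

# ... or pass explicit configs (mutually exclusive with `seg_guidance_layers`).
configs = [SmoothedEnergyGuidanceConfig(indices=[i], fqn="transformer_blocks") for i in (7, 8, 9)]
guider = SmoothedEnergyGuidance(guidance_scale=7.5, seg_guidance_scale=2.8, seg_guidance_config=configs)
```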
src/diffusers/guiders/tangential_classifier_free_guidance.py (new file, 137 lines)
@@ -0,0 +1,137 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from typing import Optional, List, TYPE_CHECKING, Dict, Union, Tuple

import torch

from .guider_utils import BaseGuidance, rescale_noise_cfg

if TYPE_CHECKING:
    from ..modular_pipelines.modular_pipeline import BlockState


class TangentialClassifierFreeGuidance(BaseGuidance):
    """
    Tangential Classifier Free Guidance (TCFG): https://huggingface.co/papers/2503.18137

    Args:
        guidance_scale (`float`, defaults to `7.5`):
            The scale parameter for classifier-free guidance. Higher values result in stronger conditioning on the
            text prompt, while lower values allow for more freedom in generation. Higher values may lead to
            saturation and deterioration of image quality.
        guidance_rescale (`float`, defaults to `0.0`):
            The rescale factor applied to the noise predictions. This is used to improve image quality and fix
            overexposure. Based on Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
            Flawed](https://huggingface.co/papers/2305.08891).
        use_original_formulation (`bool`, defaults to `False`):
            Whether to use the original formulation of classifier-free guidance as proposed in the paper. By default,
            we use the diffusers-native implementation that has been in the codebase for a long time. See
            [`~guiders.classifier_free_guidance.ClassifierFreeGuidance`] for more details.
        start (`float`, defaults to `0.0`):
            The fraction of the total number of denoising steps after which guidance starts.
        stop (`float`, defaults to `1.0`):
            The fraction of the total number of denoising steps after which guidance stops.
    """

    _input_predictions = ["pred_cond", "pred_uncond"]

    def __init__(
        self,
        guidance_scale: float = 7.5,
        guidance_rescale: float = 0.0,
        use_original_formulation: bool = False,
        start: float = 0.0,
        stop: float = 1.0,
    ):
        super().__init__(start, stop)

        self.guidance_scale = guidance_scale
        self.guidance_rescale = guidance_rescale
        self.use_original_formulation = use_original_formulation

    def prepare_inputs(
        self, data: "BlockState", input_fields: Optional[Dict[str, Union[str, Tuple[str, str]]]] = None
    ) -> List["BlockState"]:
        if input_fields is None:
            input_fields = self._input_fields

        tuple_indices = [0] if self.num_conditions == 1 else [0, 1]
        data_batches = []
        for i in range(self.num_conditions):
            data_batch = self._prepare_batch(input_fields, data, tuple_indices[i], self._input_predictions[i])
            data_batches.append(data_batch)
        return data_batches

    def forward(self, pred_cond: torch.Tensor, pred_uncond: Optional[torch.Tensor] = None) -> torch.Tensor:
        pred = None

        if not self._is_tcfg_enabled():
            pred = pred_cond
        else:
            pred = normalized_guidance(pred_cond, pred_uncond, self.guidance_scale, self.use_original_formulation)

        if self.guidance_rescale > 0.0:
            pred = rescale_noise_cfg(pred, pred_cond, self.guidance_rescale)

        return pred, {}

    @property
    def is_conditional(self) -> bool:
        return self._num_outputs_prepared == 1

    @property
    def num_conditions(self) -> int:
        num_conditions = 1
        if self._is_tcfg_enabled():
            num_conditions += 1
        return num_conditions

    def _is_tcfg_enabled(self) -> bool:
        if not self._enabled:
            return False

        is_within_range = True
        if self._num_inference_steps is not None:
            skip_start_step = int(self._start * self._num_inference_steps)
            skip_stop_step = int(self._stop * self._num_inference_steps)
            is_within_range = skip_start_step <= self._step < skip_stop_step

        is_close = False
        if self.use_original_formulation:
            is_close = math.isclose(self.guidance_scale, 0.0)
        else:
            is_close = math.isclose(self.guidance_scale, 1.0)

        return is_within_range and not is_close


def normalized_guidance(
    pred_cond: torch.Tensor, pred_uncond: torch.Tensor, guidance_scale: float, use_original_formulation: bool = False
) -> torch.Tensor:
    cond_dtype = pred_cond.dtype
    preds = torch.stack([pred_cond, pred_uncond], dim=1).float()
    preds = preds.flatten(2)
    U, S, Vh = torch.linalg.svd(preds, full_matrices=False)
    Vh_modified = Vh.clone()
    Vh_modified[:, 1] = 0

    uncond_flat = pred_uncond.reshape(pred_uncond.size(0), 1, -1).float()
    x_Vh = torch.matmul(uncond_flat, Vh.transpose(-2, -1))
    x_Vh_V = torch.matmul(x_Vh, Vh_modified)
    pred_uncond = x_Vh_V.reshape(pred_uncond.shape).to(cond_dtype)

    pred = pred_cond if use_original_formulation else pred_uncond
    shift = pred_cond - pred_uncond
    pred = pred + guidance_scale * shift

    return pred
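To make the projection in `normalized_guidance` concrete: the unconditional prediction is projected so that its component along the second right-singular direction of the stacked (conditional, unconditional) predictions is removed before the usual CFG update. A standalone sketch on dummy tensors (mirroring the code above rather than importing it, so it runs with stock PyTorch):

```python
# Sketch of the TCFG projection step followed by the diffusers-native CFG update.
import torch

guidance_scale = 7.5
pred_cond = torch.randn(2, 4, 8, 8)
pred_uncond = torch.randn(2, 4, 8, 8)

preds = torch.stack([pred_cond, pred_uncond], dim=1).float().flatten(2)  # (batch, 2, dims)
U, S, Vh = torch.linalg.svd(preds, full_matrices=False)
Vh_modified = Vh.clone()
Vh_modified[:, 1] = 0  # drop the second right-singular (tangential) direction

uncond_flat = pred_uncond.reshape(pred_uncond.size(0), 1, -1).float()
uncond_proj = (uncond_flat @ Vh.transpose(-2, -1) @ Vh_modified).reshape(pred_uncond.shape)

pred = uncond_proj + guidance_scale * (pred_cond - uncond_proj)
print(pred.shape)
```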
@@ -5,5 +5,7 @@ if is_torch_available():
    from .faster_cache import FasterCacheConfig, apply_faster_cache
    from .group_offloading import apply_group_offloading
    from .hooks import HookRegistry, ModelHook
    from .layer_skip import LayerSkipConfig, apply_layer_skip
    from .layerwise_casting import apply_layerwise_casting, apply_layerwise_casting_hook
    from .pyramid_attention_broadcast import PyramidAttentionBroadcastConfig, apply_pyramid_attention_broadcast
    from .smoothed_energy_guidance_utils import SmoothedEnergyGuidanceConfig
src/diffusers/hooks/_common.py (new file, 43 lines)
@@ -0,0 +1,43 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

import torch

from ..models.attention import FeedForward, LuminaFeedForward
from ..models.attention_processor import Attention, MochiAttention


_ATTENTION_CLASSES = (Attention, MochiAttention)
_FEEDFORWARD_CLASSES = (FeedForward, LuminaFeedForward)

_SPATIAL_TRANSFORMER_BLOCK_IDENTIFIERS = ("blocks", "transformer_blocks", "single_transformer_blocks", "layers")
_TEMPORAL_TRANSFORMER_BLOCK_IDENTIFIERS = ("temporal_transformer_blocks",)
_CROSS_TRANSFORMER_BLOCK_IDENTIFIERS = ("blocks", "transformer_blocks", "layers")

_ALL_TRANSFORMER_BLOCK_IDENTIFIERS = tuple(
    {
        *_SPATIAL_TRANSFORMER_BLOCK_IDENTIFIERS,
        *_TEMPORAL_TRANSFORMER_BLOCK_IDENTIFIERS,
        *_CROSS_TRANSFORMER_BLOCK_IDENTIFIERS,
    }
)


def _get_submodule_from_fqn(module: torch.nn.Module, fqn: str) -> Optional[torch.nn.Module]:
    for submodule_name, submodule in module.named_modules():
        if submodule_name == fqn:
            return submodule
    return None
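A toy illustration of the fqn-based lookup performed by `_get_submodule_from_fqn` above (the helper is re-implemented inline here so the sketch runs without this branch installed):

```python
import torch


def get_submodule_from_fqn(module, fqn):
    # same logic as _get_submodule_from_fqn in _common.py
    for submodule_name, submodule in module.named_modules():
        if submodule_name == fqn:
            return submodule
    return None


model = torch.nn.ModuleDict(
    {"transformer_blocks": torch.nn.ModuleList([torch.nn.Linear(4, 4) for _ in range(3)])}
)
blocks = get_submodule_from_fqn(model, "transformer_blocks")
print(type(blocks), len(blocks))  # ModuleList with 3 entries
```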
src/diffusers/hooks/_helpers.py (new file, 271 lines)
@@ -0,0 +1,271 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from typing import Any, Callable, Type

from ..models.attention import BasicTransformerBlock
from ..models.attention_processor import AttnProcessor2_0
from ..models.transformers.cogvideox_transformer_3d import CogVideoXBlock
from ..models.transformers.transformer_cogview4 import CogView4AttnProcessor, CogView4TransformerBlock
from ..models.transformers.transformer_flux import FluxSingleTransformerBlock, FluxTransformerBlock
from ..models.transformers.transformer_hunyuan_video import (
    HunyuanVideoSingleTransformerBlock,
    HunyuanVideoTokenReplaceSingleTransformerBlock,
    HunyuanVideoTokenReplaceTransformerBlock,
    HunyuanVideoTransformerBlock,
)
from ..models.transformers.transformer_ltx import LTXVideoTransformerBlock
from ..models.transformers.transformer_mochi import MochiTransformerBlock
from ..models.transformers.transformer_wan import WanTransformerBlock


@dataclass
class AttentionProcessorMetadata:
    skip_processor_output_fn: Callable[[Any], Any]


@dataclass
class TransformerBlockMetadata:
    skip_block_output_fn: Callable[[Any], Any]
    return_hidden_states_index: int = None
    return_encoder_hidden_states_index: int = None


class AttentionProcessorRegistry:
    _registry = {}

    @classmethod
    def register(cls, model_class: Type, metadata: AttentionProcessorMetadata):
        cls._registry[model_class] = metadata

    @classmethod
    def get(cls, model_class: Type) -> AttentionProcessorMetadata:
        if model_class not in cls._registry:
            raise ValueError(f"Model class {model_class} not registered.")
        return cls._registry[model_class]


class TransformerBlockRegistry:
    _registry = {}

    @classmethod
    def register(cls, model_class: Type, metadata: TransformerBlockMetadata):
        cls._registry[model_class] = metadata

    @classmethod
    def get(cls, model_class: Type) -> TransformerBlockMetadata:
        if model_class not in cls._registry:
            raise ValueError(f"Model class {model_class} not registered.")
        return cls._registry[model_class]


def _register_attention_processors_metadata():
    # AttnProcessor2_0
    AttentionProcessorRegistry.register(
        model_class=AttnProcessor2_0,
        metadata=AttentionProcessorMetadata(
            skip_processor_output_fn=_skip_proc_output_fn_Attention_AttnProcessor2_0,
        ),
    )

    # CogView4AttnProcessor
    AttentionProcessorRegistry.register(
        model_class=CogView4AttnProcessor,
        metadata=AttentionProcessorMetadata(
            skip_processor_output_fn=_skip_proc_output_fn_Attention_CogView4AttnProcessor,
        ),
    )


def _register_transformer_blocks_metadata():
    # BasicTransformerBlock
    TransformerBlockRegistry.register(
        model_class=BasicTransformerBlock,
        metadata=TransformerBlockMetadata(
            skip_block_output_fn=_skip_block_output_fn_BasicTransformerBlock,
            return_hidden_states_index=0,
            return_encoder_hidden_states_index=None,
        ),
    )

    # CogVideoX
    TransformerBlockRegistry.register(
        model_class=CogVideoXBlock,
        metadata=TransformerBlockMetadata(
            skip_block_output_fn=_skip_block_output_fn_CogVideoXBlock,
            return_hidden_states_index=0,
            return_encoder_hidden_states_index=1,
        ),
    )

    # CogView4
    TransformerBlockRegistry.register(
        model_class=CogView4TransformerBlock,
        metadata=TransformerBlockMetadata(
            skip_block_output_fn=_skip_block_output_fn_CogView4TransformerBlock,
            return_hidden_states_index=0,
            return_encoder_hidden_states_index=1,
        ),
    )

    # Flux
    TransformerBlockRegistry.register(
        model_class=FluxTransformerBlock,
        metadata=TransformerBlockMetadata(
            skip_block_output_fn=_skip_block_output_fn_FluxTransformerBlock,
            return_hidden_states_index=1,
            return_encoder_hidden_states_index=0,
        ),
    )
    TransformerBlockRegistry.register(
        model_class=FluxSingleTransformerBlock,
        metadata=TransformerBlockMetadata(
            skip_block_output_fn=_skip_block_output_fn_FluxSingleTransformerBlock,
            return_hidden_states_index=1,
            return_encoder_hidden_states_index=0,
        ),
    )

    # HunyuanVideo
    TransformerBlockRegistry.register(
        model_class=HunyuanVideoTransformerBlock,
        metadata=TransformerBlockMetadata(
            skip_block_output_fn=_skip_block_output_fn_HunyuanVideoTransformerBlock,
            return_hidden_states_index=0,
            return_encoder_hidden_states_index=1,
        ),
    )
    TransformerBlockRegistry.register(
        model_class=HunyuanVideoSingleTransformerBlock,
        metadata=TransformerBlockMetadata(
            skip_block_output_fn=_skip_block_output_fn_HunyuanVideoSingleTransformerBlock,
            return_hidden_states_index=0,
            return_encoder_hidden_states_index=1,
        ),
    )
    TransformerBlockRegistry.register(
        model_class=HunyuanVideoTokenReplaceTransformerBlock,
        metadata=TransformerBlockMetadata(
            skip_block_output_fn=_skip_block_output_fn_HunyuanVideoTokenReplaceTransformerBlock,
            return_hidden_states_index=0,
            return_encoder_hidden_states_index=1,
        ),
    )
    TransformerBlockRegistry.register(
        model_class=HunyuanVideoTokenReplaceSingleTransformerBlock,
        metadata=TransformerBlockMetadata(
            skip_block_output_fn=_skip_block_output_fn_HunyuanVideoTokenReplaceSingleTransformerBlock,
            return_hidden_states_index=0,
            return_encoder_hidden_states_index=1,
        ),
    )

    # LTXVideo
    TransformerBlockRegistry.register(
        model_class=LTXVideoTransformerBlock,
        metadata=TransformerBlockMetadata(
            skip_block_output_fn=_skip_block_output_fn_LTXVideoTransformerBlock,
            return_hidden_states_index=0,
            return_encoder_hidden_states_index=None,
        ),
    )

    # Mochi
    TransformerBlockRegistry.register(
        model_class=MochiTransformerBlock,
        metadata=TransformerBlockMetadata(
            skip_block_output_fn=_skip_block_output_fn_MochiTransformerBlock,
            return_hidden_states_index=0,
            return_encoder_hidden_states_index=1,
        ),
    )

    # Wan
    TransformerBlockRegistry.register(
        model_class=WanTransformerBlock,
        metadata=TransformerBlockMetadata(
            skip_block_output_fn=_skip_block_output_fn_WanTransformerBlock,
            return_hidden_states_index=0,
            return_encoder_hidden_states_index=None,
        ),
    )


# fmt: off
def _skip_attention___ret___hidden_states(self, *args, **kwargs):
    hidden_states = kwargs.get("hidden_states", None)
    if hidden_states is None and len(args) > 0:
        hidden_states = args[0]
    return hidden_states


def _skip_attention___ret___hidden_states___encoder_hidden_states(self, *args, **kwargs):
    hidden_states = kwargs.get("hidden_states", None)
    encoder_hidden_states = kwargs.get("encoder_hidden_states", None)
    if hidden_states is None and len(args) > 0:
        hidden_states = args[0]
    if encoder_hidden_states is None and len(args) > 1:
        encoder_hidden_states = args[1]
    return hidden_states, encoder_hidden_states


_skip_proc_output_fn_Attention_AttnProcessor2_0 = _skip_attention___ret___hidden_states
_skip_proc_output_fn_Attention_CogView4AttnProcessor = _skip_attention___ret___hidden_states___encoder_hidden_states


def _skip_block_output_fn___hidden_states_0___ret___hidden_states(self, *args, **kwargs):
    hidden_states = kwargs.get("hidden_states", None)
    if hidden_states is None and len(args) > 0:
        hidden_states = args[0]
    return hidden_states


def _skip_block_output_fn___hidden_states_0___encoder_hidden_states_1___ret___hidden_states___encoder_hidden_states(self, *args, **kwargs):
    hidden_states = kwargs.get("hidden_states", None)
    encoder_hidden_states = kwargs.get("encoder_hidden_states", None)
    if hidden_states is None and len(args) > 0:
        hidden_states = args[0]
    if encoder_hidden_states is None and len(args) > 1:
        encoder_hidden_states = args[1]
    return hidden_states, encoder_hidden_states


def _skip_block_output_fn___hidden_states_0___encoder_hidden_states_1___ret___encoder_hidden_states___hidden_states(self, *args, **kwargs):
    hidden_states = kwargs.get("hidden_states", None)
    encoder_hidden_states = kwargs.get("encoder_hidden_states", None)
    if hidden_states is None and len(args) > 0:
        hidden_states = args[0]
    if encoder_hidden_states is None and len(args) > 1:
        encoder_hidden_states = args[1]
    return encoder_hidden_states, hidden_states


_skip_block_output_fn_BasicTransformerBlock = _skip_block_output_fn___hidden_states_0___ret___hidden_states
_skip_block_output_fn_CogVideoXBlock = _skip_block_output_fn___hidden_states_0___encoder_hidden_states_1___ret___hidden_states___encoder_hidden_states
_skip_block_output_fn_CogView4TransformerBlock = _skip_block_output_fn___hidden_states_0___encoder_hidden_states_1___ret___hidden_states___encoder_hidden_states
_skip_block_output_fn_FluxTransformerBlock = _skip_block_output_fn___hidden_states_0___encoder_hidden_states_1___ret___encoder_hidden_states___hidden_states
_skip_block_output_fn_FluxSingleTransformerBlock = _skip_block_output_fn___hidden_states_0___encoder_hidden_states_1___ret___encoder_hidden_states___hidden_states
_skip_block_output_fn_HunyuanVideoTransformerBlock = _skip_block_output_fn___hidden_states_0___encoder_hidden_states_1___ret___hidden_states___encoder_hidden_states
_skip_block_output_fn_HunyuanVideoSingleTransformerBlock = _skip_block_output_fn___hidden_states_0___encoder_hidden_states_1___ret___hidden_states___encoder_hidden_states
_skip_block_output_fn_HunyuanVideoTokenReplaceTransformerBlock = _skip_block_output_fn___hidden_states_0___encoder_hidden_states_1___ret___hidden_states___encoder_hidden_states
_skip_block_output_fn_HunyuanVideoTokenReplaceSingleTransformerBlock = _skip_block_output_fn___hidden_states_0___encoder_hidden_states_1___ret___hidden_states___encoder_hidden_states
_skip_block_output_fn_LTXVideoTransformerBlock = _skip_block_output_fn___hidden_states_0___ret___hidden_states
_skip_block_output_fn_MochiTransformerBlock = _skip_block_output_fn___hidden_states_0___encoder_hidden_states_1___ret___hidden_states___encoder_hidden_states
_skip_block_output_fn_WanTransformerBlock = _skip_block_output_fn___hidden_states_0___ret___hidden_states
# fmt: on


_register_attention_processors_metadata()
_register_transformer_blocks_metadata()
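The two registries above are what the layer-skip hooks consult at runtime to know which outputs a block returns when it is skipped. A sketch of how a custom block type could be registered, assuming this branch is installed and the module lands at the path shown in this diff; `MyCustomBlock` and its skip function are hypothetical placeholders, not part of the diff:

```python
import torch

# hypothetical import path, taken from this diff
from diffusers.hooks._helpers import TransformerBlockMetadata, TransformerBlockRegistry


class MyCustomBlock(torch.nn.Module):  # hypothetical block that returns only hidden_states
    def forward(self, hidden_states):
        return hidden_states


def _skip_output_fn_MyCustomBlock(self, *args, **kwargs):
    # when the block is skipped, simply pass the input hidden_states through
    hidden_states = kwargs.get("hidden_states", args[0] if args else None)
    return hidden_states


TransformerBlockRegistry.register(
    model_class=MyCustomBlock,
    metadata=TransformerBlockMetadata(
        skip_block_output_fn=_skip_output_fn_MyCustomBlock,
        return_hidden_states_index=0,
        return_encoder_hidden_states_index=None,
    ),
)
print(TransformerBlockRegistry.get(MyCustomBlock))
```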
231
src/diffusers/hooks/layer_skip.py
Normal file
231
src/diffusers/hooks/layer_skip.py
Normal file
@@ -0,0 +1,231 @@
|
|||||||
|
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import math
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Callable, List, Optional
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from ..utils import get_logger
|
||||||
|
from ..utils.torch_utils import unwrap_module
|
||||||
|
from ._common import _ALL_TRANSFORMER_BLOCK_IDENTIFIERS, _ATTENTION_CLASSES, _FEEDFORWARD_CLASSES, _get_submodule_from_fqn
|
||||||
|
from ._helpers import AttentionProcessorRegistry, TransformerBlockRegistry
|
||||||
|
from .hooks import HookRegistry, ModelHook
|
||||||
|
|
||||||
|
|
||||||
|
logger = get_logger(__name__) # pylint: disable=invalid-name
|
||||||
|
|
||||||
|
_LAYER_SKIP_HOOK = "layer_skip_hook"
|
||||||
|
|
||||||
|
|
||||||
|
# Aryan/YiYi TODO: we need to make guider class a config mixin so I think this is not needed
|
||||||
|
# either remove or make it serializable
|
||||||
|
@dataclass
|
||||||
|
class LayerSkipConfig:
|
||||||
|
r"""
|
||||||
|
Configuration for skipping internal transformer blocks when executing a transformer model.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
indices (`List[int]`):
|
||||||
|
The indices of the layer to skip. This is typically the first layer in the transformer block.
|
||||||
|
fqn (`str`, defaults to `"auto"`):
|
||||||
|
The fully qualified name identifying the stack of transformer blocks. Typically, this is
|
||||||
|
`transformer_blocks`, `single_transformer_blocks`, `blocks`, `layers`, or `temporal_transformer_blocks`.
|
||||||
|
For automatic detection, set this to `"auto"`.
|
||||||
|
"auto" only works on DiT models. For UNet models, you must provide the correct fqn.
|
||||||
|
skip_attention (`bool`, defaults to `True`):
|
||||||
|
Whether to skip attention blocks.
|
||||||
|
skip_ff (`bool`, defaults to `True`):
|
||||||
|
Whether to skip feed-forward blocks.
|
||||||
|
skip_attention_scores (`bool`, defaults to `False`):
|
||||||
|
Whether to skip attention score computation in the attention blocks. This is equivalent to using `value`
|
||||||
|
projections as the output of scaled dot product attention.
|
||||||
|
dropout (`float`, defaults to `1.0`):
|
||||||
|
The dropout probability for dropping the outputs of the skipped layers. By default, this is set to `1.0`,
|
||||||
|
meaning that the outputs of the skipped layers are completely ignored. If set to `0.0`, the outputs of the
|
||||||
|
skipped layers are fully retained, which is equivalent to not skipping any layers.
|
||||||
|
"""
|
||||||
|
|
||||||
|
indices: List[int]
|
||||||
|
fqn: str = "auto"
|
||||||
|
skip_attention: bool = True
|
||||||
|
skip_attention_scores: bool = False
|
||||||
|
skip_ff: bool = True
|
||||||
|
dropout: float = 1.0
|
||||||
|
|
||||||
|
def __post_init__(self):
|
||||||
|
if not (0 <= self.dropout <= 1):
|
||||||
|
raise ValueError(f"Expected `dropout` to be between 0.0 and 1.0, but got {self.dropout}.")
|
||||||
|
if not math.isclose(self.dropout, 1.0) and self.skip_attention_scores:
|
||||||
|
raise ValueError(
|
||||||
|
"Cannot set `skip_attention_scores` to True when `dropout` is not 1.0. Please set `dropout` to 1.0."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class AttentionScoreSkipFunctionMode(torch.overrides.TorchFunctionMode):
|
||||||
|
def __torch_function__(self, func, types, args=(), kwargs=None):
|
||||||
|
if kwargs is None:
|
||||||
|
kwargs = {}
|
||||||
|
if func is torch.nn.functional.scaled_dot_product_attention:
|
||||||
|
value = kwargs.get("value", None)
|
||||||
|
if value is None:
|
||||||
|
value = args[2]
|
||||||
|
return value
|
||||||
|
return func(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class AttentionProcessorSkipHook(ModelHook):
|
||||||
|
def __init__(self, skip_processor_output_fn: Callable, skip_attention_scores: bool = False, dropout: float = 1.0):
|
||||||
|
self.skip_processor_output_fn = skip_processor_output_fn
|
||||||
|
self.skip_attention_scores = skip_attention_scores
|
||||||
|
self.dropout = dropout
|
||||||
|
|
||||||
|
def new_forward(self, module: torch.nn.Module, *args, **kwargs):
|
||||||
|
if self.skip_attention_scores:
|
||||||
|
if not math.isclose(self.dropout, 1.0):
|
||||||
|
raise ValueError(
|
||||||
|
"Cannot set `skip_attention_scores` to True when `dropout` is not 1.0. Please set `dropout` to 1.0."
|
||||||
|
)
|
||||||
|
with AttentionScoreSkipFunctionMode():
|
||||||
|
output = self.fn_ref.original_forward(*args, **kwargs)
|
||||||
|
else:
|
||||||
|
if math.isclose(self.dropout, 1.0):
|
||||||
|
output = self.skip_processor_output_fn(module, *args, **kwargs)
|
||||||
|
else:
|
||||||
|
output = self.fn_ref.original_forward(*args, **kwargs)
|
||||||
|
output = torch.nn.functional.dropout(output, p=self.dropout)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
class FeedForwardSkipHook(ModelHook):
|
||||||
|
def __init__(self, dropout: float):
|
||||||
|
super().__init__()
|
||||||
|
self.dropout = dropout
|
||||||
|
|
||||||
|
def new_forward(self, module: torch.nn.Module, *args, **kwargs):
|
||||||
|
if math.isclose(self.dropout, 1.0):
|
||||||
|
output = kwargs.get("hidden_states", None)
|
||||||
|
if output is None:
|
||||||
|
output = kwargs.get("x", None)
|
||||||
|
if output is None and len(args) > 0:
|
||||||
|
output = args[0]
|
||||||
|
else:
|
||||||
|
output = self.fn_ref.original_forward(*args, **kwargs)
|
||||||
|
output = torch.nn.functional.dropout(output, p=self.dropout)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
class TransformerBlockSkipHook(ModelHook):
|
||||||
|
def __init__(self, dropout: float):
|
||||||
|
super().__init__()
|
||||||
|
self.dropout = dropout
|
||||||
|
|
||||||
|
def initialize_hook(self, module):
|
||||||
|
self._metadata = TransformerBlockRegistry.get(unwrap_module(module).__class__)
|
||||||
|
return module
|
||||||
|
|
||||||
|
def new_forward(self, module: torch.nn.Module, *args, **kwargs):
|
||||||
|
if math.isclose(self.dropout, 1.0):
|
||||||
|
output = self._metadata.skip_block_output_fn(module, *args, **kwargs)
|
||||||
|
else:
|
||||||
|
output = self.fn_ref.original_forward(*args, **kwargs)
|
||||||
|
output = torch.nn.functional.dropout(output, p=self.dropout)
|
||||||
|
return output
|
||||||
|
|
||||||
|
def apply_layer_skip(module: torch.nn.Module, config: LayerSkipConfig) -> None:
    r"""
    Apply layer skipping to internal layers of a transformer.

    Args:
        module (`torch.nn.Module`):
            The transformer model to which the layer skip hook should be applied.
        config (`LayerSkipConfig`):
            The configuration for the layer skip hook.

    Example:

    ```python
    >>> import torch
    >>> from diffusers import apply_layer_skip, CogVideoXTransformer3DModel, LayerSkipConfig

    >>> transformer = CogVideoXTransformer3DModel.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
    >>> config = LayerSkipConfig(indices=[10, 20], fqn="transformer_blocks")
    >>> apply_layer_skip(transformer, config)
    ```
    """
    _apply_layer_skip_hook(module, config)


def _apply_layer_skip_hook(module: torch.nn.Module, config: LayerSkipConfig, name: Optional[str] = None) -> None:
    name = name or _LAYER_SKIP_HOOK

    if config.skip_attention and config.skip_attention_scores:
        raise ValueError("Cannot set both `skip_attention` and `skip_attention_scores` to True. Please choose one.")
    if not math.isclose(config.dropout, 1.0) and config.skip_attention_scores:
        raise ValueError("Cannot set `skip_attention_scores` to True when `dropout` is not 1.0. Please set `dropout` to 1.0.")

    if config.fqn == "auto":
        for identifier in _ALL_TRANSFORMER_BLOCK_IDENTIFIERS:
            if hasattr(module, identifier):
                config.fqn = identifier
                break
        else:
            raise ValueError(
                "Could not find a suitable identifier for the transformer blocks automatically. Please provide a valid "
                "`fqn` (fully qualified name) that identifies a stack of transformer blocks."
            )

    transformer_blocks = _get_submodule_from_fqn(module, config.fqn)
    if transformer_blocks is None or not isinstance(transformer_blocks, torch.nn.ModuleList):
        raise ValueError(
            f"Could not find {config.fqn} in the provided module, or configured `fqn` (fully qualified name) does not identify "
            f"a `torch.nn.ModuleList`. Please provide a valid `fqn` that identifies a stack of transformer blocks."
        )
    if len(config.indices) == 0:
        raise ValueError("Layer index list is empty. Please provide a non-empty list of layer indices to skip.")

    blocks_found = False
    for i, block in enumerate(transformer_blocks):
        if i not in config.indices:
            continue

        blocks_found = True

        if config.skip_attention and config.skip_ff:
            logger.debug(f"Applying TransformerBlockSkipHook to '{config.fqn}.{i}'")
            registry = HookRegistry.check_if_exists_or_initialize(block)
            hook = TransformerBlockSkipHook(config.dropout)
            registry.register_hook(hook, name)

        elif config.skip_attention or config.skip_attention_scores:
            for submodule_name, submodule in block.named_modules():
                if isinstance(submodule, _ATTENTION_CLASSES) and not submodule.is_cross_attention:
                    logger.debug(f"Applying AttentionProcessorSkipHook to '{config.fqn}.{i}.{submodule_name}'")
                    output_fn = AttentionProcessorRegistry.get(submodule.processor.__class__).skip_processor_output_fn
                    registry = HookRegistry.check_if_exists_or_initialize(submodule)
                    hook = AttentionProcessorSkipHook(output_fn, config.skip_attention_scores, config.dropout)
                    registry.register_hook(hook, name)

        if config.skip_ff:
            for submodule_name, submodule in block.named_modules():
                if isinstance(submodule, _FEEDFORWARD_CLASSES):
                    logger.debug(f"Applying FeedForwardSkipHook to '{config.fqn}.{i}.{submodule_name}'")
                    registry = HookRegistry.check_if_exists_or_initialize(submodule)
                    hook = FeedForwardSkipHook(config.dropout)
                    registry.register_hook(hook, name)

    if not blocks_found:
        raise ValueError(
            f"Could not find any transformer blocks matching the provided indices {config.indices} and "
            f"fully qualified name '{config.fqn}'. Please check the indices and fqn for correctness."
        )
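A quick sketch (not part of the diff) of how a `LayerSkipConfig` drives the validation above; the field names are taken from how `_apply_layer_skip_hook` reads the config, and the top-level import path mirrors the docstring example, so treat both as assumptions.

```python
from diffusers import LayerSkipConfig

# Skip only the attention scores of blocks 10 and 20. `dropout` must stay at 1.0 when
# `skip_attention_scores` is True, otherwise the hook raises a ValueError (see the
# validation at the top of `_apply_layer_skip_hook`).
config = LayerSkipConfig(
    indices=[10, 20],
    fqn="transformer_blocks",
    skip_attention=False,
    skip_attention_scores=True,
    skip_ff=False,
    dropout=1.0,
)
```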
158  src/diffusers/hooks/smoothed_energy_guidance_utils.py  Normal file
@@ -0,0 +1,158 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

import torch
import torch.nn.functional as F

from ..utils import get_logger
from ._common import _ALL_TRANSFORMER_BLOCK_IDENTIFIERS, _ATTENTION_CLASSES, _get_submodule_from_fqn
from .hooks import HookRegistry, ModelHook


logger = get_logger(__name__)  # pylint: disable=invalid-name

_SMOOTHED_ENERGY_GUIDANCE_HOOK = "smoothed_energy_guidance_hook"


@dataclass
class SmoothedEnergyGuidanceConfig:
    r"""
    Configuration for skipping internal transformer blocks when executing a transformer model.

    Args:
        indices (`List[int]`):
            The indices of the layers to skip. This is typically the first layer in the transformer block.
        fqn (`str`, defaults to `"auto"`):
            The fully qualified name identifying the stack of transformer blocks. Typically, this is
            `transformer_blocks`, `single_transformer_blocks`, `blocks`, `layers`, or `temporal_transformer_blocks`.
            For automatic detection, set this to `"auto"`. `"auto"` only works on DiT models. For UNet models, you
            must provide the correct fqn.
        _query_proj_identifiers (`List[str]`, defaults to `None`):
            The identifiers for the query projection layers. Typically, these are `to_q`, `query`, or `q_proj`. If
            `None`, `to_q` is used by default.
    """

    indices: List[int]
    fqn: str = "auto"
    _query_proj_identifiers: Optional[List[str]] = None


class SmoothedEnergyGuidanceHook(ModelHook):
    def __init__(self, blur_sigma: float = 1.0, blur_threshold_inf: float = 9999.9) -> None:
        super().__init__()
        self.blur_sigma = blur_sigma
        self.blur_threshold_inf = blur_threshold_inf

    def post_forward(self, module: torch.nn.Module, output: torch.Tensor) -> torch.Tensor:
        # Copied from https://github.com/SusungHong/SEG-SDXL/blob/cf8256d640d5373541cfea3b3b6caf93272cf986/pipeline_seg.py#L172C31-L172C102
        kernel_size = math.ceil(6 * self.blur_sigma) + 1 - math.ceil(6 * self.blur_sigma) % 2
        smoothed_output = _gaussian_blur_2d(output, kernel_size, self.blur_sigma, self.blur_threshold_inf)
        return smoothed_output


def _apply_smoothed_energy_guidance_hook(
    module: torch.nn.Module, config: SmoothedEnergyGuidanceConfig, blur_sigma: float, name: Optional[str] = None
) -> None:
    name = name or _SMOOTHED_ENERGY_GUIDANCE_HOOK

    if config.fqn == "auto":
        for identifier in _ALL_TRANSFORMER_BLOCK_IDENTIFIERS:
            if hasattr(module, identifier):
                config.fqn = identifier
                break
        else:
            raise ValueError(
                "Could not find a suitable identifier for the transformer blocks automatically. Please provide a valid "
                "`fqn` (fully qualified name) that identifies a stack of transformer blocks."
            )

    if config._query_proj_identifiers is None:
        config._query_proj_identifiers = ["to_q"]

    transformer_blocks = _get_submodule_from_fqn(module, config.fqn)
    blocks_found = False
    for i, block in enumerate(transformer_blocks):
        if i not in config.indices:
            continue

        blocks_found = True

        for submodule_name, submodule in block.named_modules():
            if not isinstance(submodule, _ATTENTION_CLASSES) or submodule.is_cross_attention:
                continue
            for identifier in config._query_proj_identifiers:
                query_proj = getattr(submodule, identifier, None)
                if query_proj is None or not isinstance(query_proj, torch.nn.Linear):
                    continue
                logger.debug(
                    f"Registering smoothed energy guidance hook on {config.fqn}.{i}.{submodule_name}.{identifier}"
                )
                registry = HookRegistry.check_if_exists_or_initialize(query_proj)
                hook = SmoothedEnergyGuidanceHook(blur_sigma)
                registry.register_hook(hook, name)

    if not blocks_found:
        raise ValueError(
            f"Could not find any transformer blocks matching the provided indices {config.indices} and "
            f"fully qualified name '{config.fqn}'. Please check the indices and fqn for correctness."
        )


# Modified from https://github.com/SusungHong/SEG-SDXL/blob/cf8256d640d5373541cfea3b3b6caf93272cf986/pipeline_seg.py#L71
def _gaussian_blur_2d(query: torch.Tensor, kernel_size: int, sigma: float, sigma_threshold_inf: float) -> torch.Tensor:
    """
    This implementation assumes that the input query is for visual (image/videos) tokens to apply the 2D gaussian
    blur. However, some models use joint text-visual token attention for which this may not be suitable. Additionally,
    this implementation also assumes that the visual tokens come from a square image/video. In practice, despite
    these assumptions, applying the 2D square gaussian blur on the query projections generates reasonable results
    for Smoothed Energy Guidance.

    SEG is only supported as an experimental prototype feature for now, so the implementation may be modified
    in the future without warning or guarantee of reproducibility.
    """
    assert query.ndim == 3

    is_inf = sigma > sigma_threshold_inf
    batch_size, seq_len, embed_dim = query.shape

    seq_len_sqrt = int(math.sqrt(seq_len))
    num_square_tokens = seq_len_sqrt * seq_len_sqrt
    query_slice = query[:, :num_square_tokens, :]
    query_slice = query_slice.permute(0, 2, 1)
    query_slice = query_slice.reshape(batch_size, embed_dim, seq_len_sqrt, seq_len_sqrt)

    if is_inf:
        kernel_size = min(kernel_size, seq_len_sqrt - (seq_len_sqrt % 2 - 1))
        kernel_size_half = (kernel_size - 1) / 2

        x = torch.linspace(-kernel_size_half, kernel_size_half, steps=kernel_size)
        pdf = torch.exp(-0.5 * (x / sigma).pow(2))
        kernel1d = pdf / pdf.sum()
        kernel1d = kernel1d.to(query)
        kernel2d = torch.matmul(kernel1d[:, None], kernel1d[None, :])
        kernel2d = kernel2d.expand(embed_dim, 1, kernel2d.shape[0], kernel2d.shape[1])

        padding = [kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2]
        query_slice = F.pad(query_slice, padding, mode="reflect")
        query_slice = F.conv2d(query_slice, kernel2d, groups=embed_dim)
    else:
        query_slice[:] = query_slice.mean(dim=(-2, -1), keepdim=True)

    query_slice = query_slice.reshape(batch_size, embed_dim, num_square_tokens)
    query_slice = query_slice.permute(0, 2, 1)
    query[:, :num_square_tokens, :] = query_slice.clone()

    return query
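For reference, a minimal, self-contained sketch of what the hook's `post_forward` does with the helper above; the shapes and sigma values are made up, and the private import path follows the file header of this diff.

```python
# Illustrative only: smooth a fake query projection the way SmoothedEnergyGuidanceHook does.
import math

import torch

from diffusers.hooks.smoothed_energy_guidance_utils import _gaussian_blur_2d

batch_size, seq_len, embed_dim = 2, 64 * 64, 128  # 64x64 "square" visual tokens
query = torch.randn(batch_size, seq_len, embed_dim)

blur_sigma, blur_threshold_inf = 1.0, 9999.9
# Same kernel-size formula as SmoothedEnergyGuidanceHook.post_forward
kernel_size = math.ceil(6 * blur_sigma) + 1 - math.ceil(6 * blur_sigma) % 2
smoothed = _gaussian_blur_2d(query, kernel_size, blur_sigma, blur_threshold_inf)
print(smoothed.shape)  # torch.Size([2, 4096, 128]) -- shape is preserved
```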
@@ -84,6 +84,7 @@ if is_torch_available():
         "IPAdapterMixin",
         "FluxIPAdapterMixin",
         "SD3IPAdapterMixin",
+        "ModularIPAdapterMixin",
     ]

     _import_structure["peft"] = ["PeftAdapterMixin"]
@@ -102,6 +103,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
         FluxIPAdapterMixin,
         IPAdapterMixin,
         SD3IPAdapterMixin,
+        ModularIPAdapterMixin,
     )
     from .lora_pipeline import (
         AmusedLoraLoaderMixin,
@@ -356,6 +356,265 @@ class IPAdapterMixin:
            )
        self.unet.set_attn_processor(attn_procs)


class ModularIPAdapterMixin:
    """Mixin for handling IP Adapters."""

    @validate_hf_hub_args
    def load_ip_adapter(
        self,
        pretrained_model_name_or_path_or_dict: Union[str, List[str], Dict[str, torch.Tensor]],
        subfolder: Union[str, List[str]],
        weight_name: Union[str, List[str]],
        **kwargs,
    ):
        """
        Parameters:
            pretrained_model_name_or_path_or_dict (`str` or `List[str]` or `os.PathLike` or `List[os.PathLike]` or `dict` or `List[dict]`):
                Can be either:

                - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
                  the Hub.
                - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
                  with [`ModelMixin.save_pretrained`].
                - A [torch state
                  dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
            subfolder (`str` or `List[str]`):
                The subfolder location of a model file within a larger model repository on the Hub or locally. If a
                list is passed, it should have the same length as `weight_name`.
            weight_name (`str` or `List[str]`):
                The name of the weight file to load. If a list is passed, it should have the same length as
                `subfolder`.
            cache_dir (`Union[str, os.PathLike]`, *optional*):
                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
                is not used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                cached versions if they exist.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            local_files_only (`bool`, *optional*, defaults to `False`):
                Whether to only load local model weights and configuration files or not. If set to `True`, the model
                won't be downloaded from the Hub.
            token (`str` or *bool*, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
                `diffusers-cli login` (stored in `~/.huggingface`) is used.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
                allowed by Git.
            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
                Speed up model loading only loading the pretrained weights and not initializing the weights. This also
                tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
                Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
                argument to `True` will raise an error.
        """

        # handle the list inputs for multiple IP Adapters
        if not isinstance(weight_name, list):
            weight_name = [weight_name]

        if not isinstance(pretrained_model_name_or_path_or_dict, list):
            pretrained_model_name_or_path_or_dict = [pretrained_model_name_or_path_or_dict]
        if len(pretrained_model_name_or_path_or_dict) == 1:
            pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict * len(weight_name)

        if not isinstance(subfolder, list):
            subfolder = [subfolder]
        if len(subfolder) == 1:
            subfolder = subfolder * len(weight_name)

        if len(weight_name) != len(pretrained_model_name_or_path_or_dict):
            raise ValueError("`weight_name` and `pretrained_model_name_or_path_or_dict` must have the same length.")

        if len(weight_name) != len(subfolder):
            raise ValueError("`weight_name` and `subfolder` must have the same length.")

        # Load the main state dict first.
        cache_dir = kwargs.pop("cache_dir", None)
        force_download = kwargs.pop("force_download", False)
        proxies = kwargs.pop("proxies", None)
        local_files_only = kwargs.pop("local_files_only", None)
        token = kwargs.pop("token", None)
        revision = kwargs.pop("revision", None)
        low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)

        if low_cpu_mem_usage and not is_accelerate_available():
            low_cpu_mem_usage = False
            logger.warning(
                "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
                " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
                " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
                " install accelerate\n```\n."
            )

        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
            raise NotImplementedError(
                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
                " `low_cpu_mem_usage=False`."
            )

        user_agent = {
            "file_type": "attn_procs_weights",
            "framework": "pytorch",
        }
        state_dicts = []
        for pretrained_model_name_or_path_or_dict, weight_name, subfolder in zip(
            pretrained_model_name_or_path_or_dict, weight_name, subfolder
        ):
            if not isinstance(pretrained_model_name_or_path_or_dict, dict):
                model_file = _get_model_file(
                    pretrained_model_name_or_path_or_dict,
                    weights_name=weight_name,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    local_files_only=local_files_only,
                    token=token,
                    revision=revision,
                    subfolder=subfolder,
                    user_agent=user_agent,
                )
                if weight_name.endswith(".safetensors"):
                    state_dict = {"image_proj": {}, "ip_adapter": {}}
                    with safe_open(model_file, framework="pt", device="cpu") as f:
                        for key in f.keys():
                            if key.startswith("image_proj."):
                                state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
                            elif key.startswith("ip_adapter."):
                                state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
                else:
                    state_dict = load_state_dict(model_file)
            else:
                state_dict = pretrained_model_name_or_path_or_dict

            keys = list(state_dict.keys())
            if "image_proj" not in keys and "ip_adapter" not in keys:
                raise ValueError("Required keys are (`image_proj` and `ip_adapter`) missing from the state dict.")

            state_dicts.append(state_dict)

        # create feature extractor if it has not been registered to the pipeline yet
        if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None:
            # FaceID IP adapters don't need the image encoder so it's not present, in this case we default to 224
            default_clip_size = 224
            clip_image_size = (
                self.image_encoder.config.image_size if self.image_encoder is not None else default_clip_size
            )
            feature_extractor = CLIPImageProcessor(size=clip_image_size, crop_size=clip_image_size)

        unet_name = getattr(self, "unet_name", "unet")
        unet = getattr(self, unet_name)
        unet._load_ip_adapter_weights(state_dicts, low_cpu_mem_usage=low_cpu_mem_usage)

        extra_loras = unet._load_ip_adapter_loras(state_dicts)
        if extra_loras != {}:
            if not USE_PEFT_BACKEND:
                logger.warning("PEFT backend is required to load these weights.")
            else:
                # apply the IP Adapter Face ID LoRA weights
                peft_config = getattr(unet, "peft_config", {})
                for k, lora in extra_loras.items():
                    if f"faceid_{k}" not in peft_config:
                        self.load_lora_weights(lora, adapter_name=f"faceid_{k}")
                        self.set_adapters([f"faceid_{k}"], adapter_weights=[1.0])

    def set_ip_adapter_scale(self, scale):
        """
        Set IP-Adapter scales per-transformer block. Input `scale` could be a single config or a list of configs for
        granular control over each IP-Adapter behavior. A config can be a float or a dictionary.

        Example:

        ```py
        # To use original IP-Adapter
        scale = 1.0
        pipeline.set_ip_adapter_scale(scale)

        # To use style block only
        scale = {
            "up": {"block_0": [0.0, 1.0, 0.0]},
        }
        pipeline.set_ip_adapter_scale(scale)

        # To use style+layout blocks
        scale = {
            "down": {"block_2": [0.0, 1.0]},
            "up": {"block_0": [0.0, 1.0, 0.0]},
        }
        pipeline.set_ip_adapter_scale(scale)

        # To use style and layout from 2 reference images
        scales = [{"down": {"block_2": [0.0, 1.0]}}, {"up": {"block_0": [0.0, 1.0, 0.0]}}]
        pipeline.set_ip_adapter_scale(scales)
        ```
        """
        unet_name = getattr(self, "unet_name", "unet")
        unet = getattr(self, unet_name)
        if not isinstance(scale, list):
            scale = [scale]
        scale_configs = _maybe_expand_lora_scales(unet, scale, default_scale=0.0)

        for attn_name, attn_processor in unet.attn_processors.items():
            if isinstance(
                attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0, IPAdapterXFormersAttnProcessor)
            ):
                if len(scale_configs) != len(attn_processor.scale):
                    raise ValueError(
                        f"Cannot assign {len(scale_configs)} scale_configs to "
                        f"{len(attn_processor.scale)} IP-Adapter."
                    )
                elif len(scale_configs) == 1:
                    scale_configs = scale_configs * len(attn_processor.scale)
                for i, scale_config in enumerate(scale_configs):
                    if isinstance(scale_config, dict):
                        for k, s in scale_config.items():
                            if attn_name.startswith(k):
                                attn_processor.scale[i] = s
                    else:
                        attn_processor.scale[i] = scale_config

    def unload_ip_adapter(self):
        """
        Unloads the IP Adapter weights

        Examples:

        ```python
        >>> # Assuming `pipeline` is already loaded with the IP Adapter weights.
        >>> pipeline.unload_ip_adapter()
        >>> ...
        ```
        """

        # remove hidden encoder
        if self.unet is None:
            return

        self.unet.encoder_hid_proj = None
        self.unet.config.encoder_hid_dim_type = None

        # Kolors: restore `encoder_hid_proj` with `text_encoder_hid_proj`
        if hasattr(self.unet, "text_encoder_hid_proj") and self.unet.text_encoder_hid_proj is not None:
            self.unet.encoder_hid_proj = self.unet.text_encoder_hid_proj
            self.unet.text_encoder_hid_proj = None
            self.unet.config.encoder_hid_dim_type = "text_proj"

        # restore original Unet attention processors layers
        attn_procs = {}
        for name, value in self.unet.attn_processors.items():
            attn_processor_class = (
                AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") else AttnProcessor()
            )
            attn_procs[name] = (
                attn_processor_class
                if isinstance(
                    value, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0, IPAdapterXFormersAttnProcessor)
                )
                else value.__class__()
            )
        self.unet.set_attn_processor(attn_procs)


class FluxIPAdapterMixin:
    """Mixin for handling Flux IP Adapters."""
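A hypothetical call into the new `ModularIPAdapterMixin`, assuming `pipeline` is a modular pipeline exposing the mixin; the repository id, subfolder and weight names follow the usual IP-Adapter layout but are not taken from this diff.

```python
# Load two IP-Adapters at once: a single subfolder string is broadcast to match the
# list of weight names, exactly as the length-normalization code above does.
pipeline.load_ip_adapter(
    "h94/IP-Adapter",
    subfolder="sdxl_models",
    weight_name=["ip-adapter_sdxl.safetensors", "ip-adapter-plus_sdxl_vit-h.safetensors"],
)

# Per-adapter, per-block scales, mirroring the docstring of `set_ip_adapter_scale` above.
pipeline.set_ip_adapter_scale([{"down": {"block_2": [0.0, 1.0]}}, {"up": {"block_0": [0.0, 1.0, 0.0]}}])
```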
@@ -441,7 +441,7 @@ def _func_optionally_disable_offloading(_pipeline):
    is_model_cpu_offload = False
    is_sequential_cpu_offload = False

-   if _pipeline is not None and _pipeline.hf_device_map is None:
+   if _pipeline is not None and hasattr(_pipeline, "hf_device_map") and _pipeline.hf_device_map is None:
        for _, component in _pipeline.components.items():
            if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"):
                if not is_model_cpu_offload:
@@ -491,6 +491,7 @@ class LoraBaseMixin:
            tuple:
                A tuple indicating if `is_model_cpu_offload` or `is_sequential_cpu_offload` is True.
        """
+
        return _func_optionally_disable_offloading(_pipeline=_pipeline)

    @classmethod
@@ -713,8 +714,10 @@ class LoraBaseMixin:
        # Decompose weights into weights for denoiser and text encoders.
        _component_adapter_weights = {}
        for component in self._lora_loadable_modules:
-           model = getattr(self, component)
+           model = getattr(self, component, None)
+           if model is None:
+               logger.warning(f"Model {component} not found in pipeline.")
+               continue
            for adapter_name, weights in zip(adapter_names, adapter_weights):
                if isinstance(weights, dict):
                    component_adapter_weights = weights.pop(component, None)
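The `getattr(..., None)` guard above lets the adapter-weight decomposition tolerate pipelines that do not expose every entry of `_lora_loadable_modules`; a rough, self-contained sketch of the resulting behavior (the `SimpleNamespace` pipeline is a stand-in, not the real class).

```python
from types import SimpleNamespace

pipeline = SimpleNamespace(unet=object())  # stand-in: has a unet, lacks a text_encoder

for component in ["unet", "text_encoder"]:  # stand-in for self._lora_loadable_modules
    model = getattr(pipeline, component, None)
    if model is None:
        print(f"Model {component} not found in pipeline.")  # mirrors the new logger.warning
        continue
    # adapters would be configured on `model` here
```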
@@ -636,7 +636,7 @@ class StableDiffusionXLLoraLoaderMixin(LoraBaseMixin):
        # First, ensure that the checkpoint is a compatible one and can be successfully loaded.
        state_dict, network_alphas = self.lora_state_dict(
            pretrained_model_name_or_path_or_dict,
-           unet_config=self.unet.config,
+           unet_config=self.unet.config if hasattr(self, "unet") else None,
            **kwargs,
        )

@@ -644,6 +644,7 @@ class StableDiffusionXLLoraLoaderMixin(LoraBaseMixin):
        if not is_correct_format:
            raise ValueError("Invalid LoRA checkpoint.")

+       if hasattr(self, "unet"):
        self.load_lora_into_unet(
            state_dict,
            network_alphas=network_alphas,
@@ -653,6 +654,7 @@ class StableDiffusionXLLoraLoaderMixin(LoraBaseMixin):
            low_cpu_mem_usage=low_cpu_mem_usage,
            hotswap=hotswap,
        )
+       if hasattr(self, "text_encoder"):
        self.load_lora_into_text_encoder(
            state_dict,
            network_alphas=network_alphas,
@@ -664,6 +666,7 @@ class StableDiffusionXLLoraLoaderMixin(LoraBaseMixin):
            low_cpu_mem_usage=low_cpu_mem_usage,
            hotswap=hotswap,
        )
+       if hasattr(self, "text_encoder_2"):
        self.load_lora_into_text_encoder(
            state_dict,
            network_alphas=network_alphas,
@@ -408,6 +408,7 @@ class UNet2DConditionLoadersMixin:
            tuple:
                A tuple indicating if `is_model_cpu_offload` or `is_sequential_cpu_offload` is True.
        """
+
        return _func_optionally_disable_offloading(_pipeline=_pipeline)

    def save_attn_procs(
84  src/diffusers/modular_pipelines/__init__.py  Normal file
@@ -0,0 +1,84 @@
from typing import TYPE_CHECKING

from ..utils import (
    DIFFUSERS_SLOW_IMPORT,
    OptionalDependencyNotAvailable,
    _LazyModule,
    get_objects_from_module,
    is_torch_available,
    is_transformers_available,
)


# These modules contain pipelines from multiple libraries/frameworks
_dummy_objects = {}
_import_structure = {}

try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    from ..utils import dummy_pt_objects  # noqa F403

    _dummy_objects.update(get_objects_from_module(dummy_pt_objects))
else:
    _import_structure["modular_pipeline"] = [
        "ModularPipelineBlocks",
        "ModularPipeline",
        "PipelineBlock",
        "AutoPipelineBlocks",
        "SequentialPipelineBlocks",
        "LoopSequentialPipelineBlocks",
        "ModularLoader",
        "PipelineState",
        "BlockState",
    ]
    _import_structure["modular_pipeline_utils"] = [
        "ComponentSpec",
        "ConfigSpec",
        "InputParam",
        "OutputParam",
    ]
    _import_structure["stable_diffusion_xl"] = ["StableDiffusionXLAutoPipeline", "StableDiffusionXLModularLoader"]
    _import_structure["components_manager"] = ["ComponentsManager"]

if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        from ..utils.dummy_pt_objects import *  # noqa F403
    else:
        from .modular_pipeline import (
            AutoPipelineBlocks,
            BlockState,
            LoopSequentialPipelineBlocks,
            ModularLoader,
            ModularPipelineBlocks,
            ModularPipeline,
            PipelineBlock,
            PipelineState,
            SequentialPipelineBlocks,
        )
        from .modular_pipeline_utils import (
            ComponentSpec,
            ConfigSpec,
            InputParam,
            OutputParam,
        )
        from .stable_diffusion_xl import (
            StableDiffusionXLAutoPipeline,
            StableDiffusionXLModularLoader,
        )
        from .components_manager import ComponentsManager
else:
    import sys

    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )
    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)
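Assuming torch is installed, the lazy module set up above resolves the names registered in `_import_structure` on first access; a minimal smoke test of the new package path.

```python
# Names are resolved lazily by _LazyModule; nothing heavy is imported until used.
from diffusers.modular_pipelines import ComponentsManager

manager = ComponentsManager()
print(type(manager).__name__)  # ComponentsManager
```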
934  src/diffusers/modular_pipelines/components_manager.py  Normal file
@@ -0,0 +1,934 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import OrderedDict
from itertools import combinations
from typing import List, Optional, Union, Dict, Any
import copy

import torch
import time
from dataclasses import dataclass

from ..utils import (
    is_accelerate_available,
    logging,
)
from ..models.modeling_utils import ModelMixin
from .modular_pipeline_utils import ComponentSpec


import uuid


if is_accelerate_available():
    from accelerate.hooks import ModelHook, add_hook_to_module, remove_hook_from_module
    from accelerate.state import PartialState
    from accelerate.utils import send_to_device
    from accelerate.utils.memory import clear_device_cache
    from accelerate.utils.modeling import convert_file_size_to_int

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


# YiYi Notes: copied from modeling_utils.py (decide later where to put this)
def get_memory_footprint(self, return_buffers=True):
    r"""
    Get the memory footprint of a model. This will return the memory footprint of the current model in bytes. Useful
    to benchmark the memory footprint of the current model and design some tests. Solution inspired from the PyTorch
    discussions: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2

    Arguments:
        return_buffers (`bool`, *optional*, defaults to `True`):
            Whether to return the size of the buffer tensors in the computation of the memory footprint. Buffers are
            tensors that do not require gradients and not registered as parameters. E.g. mean and std in batch norm
            layers. Please see: https://discuss.pytorch.org/t/what-pytorch-means-by-buffers/120266/2
    """
    mem = sum([param.nelement() * param.element_size() for param in self.parameters()])
    if return_buffers:
        mem_bufs = sum([buf.nelement() * buf.element_size() for buf in self.buffers()])
        mem = mem + mem_bufs
    return mem


class CustomOffloadHook(ModelHook):
    """
    A hook that offloads a model on the CPU until its forward pass is called. It ensures the model and its inputs are
    on the given device. Optionally offloads other models to the CPU before the forward pass is called.

    Args:
        execution_device(`str`, `int` or `torch.device`, *optional*):
            The device on which the model should be executed. Will default to the MPS device if it's available, then
            GPU 0 if there is a GPU, and finally to the CPU.
    """

    def __init__(
        self,
        execution_device: Optional[Union[str, int, torch.device]] = None,
        other_hooks: Optional[List["UserCustomOffloadHook"]] = None,
        offload_strategy: Optional["AutoOffloadStrategy"] = None,
    ):
        self.execution_device = execution_device if execution_device is not None else PartialState().default_device
        self.other_hooks = other_hooks
        self.offload_strategy = offload_strategy
        self.model_id = None

    def set_strategy(self, offload_strategy: "AutoOffloadStrategy"):
        self.offload_strategy = offload_strategy

    def add_other_hook(self, hook: "UserCustomOffloadHook"):
        """
        Add a hook to the list of hooks to consider for offloading.
        """
        if self.other_hooks is None:
            self.other_hooks = []
        self.other_hooks.append(hook)

    def init_hook(self, module):
        return module.to("cpu")

    def pre_forward(self, module, *args, **kwargs):
        if module.device != self.execution_device:
            if self.other_hooks is not None:
                hooks_to_offload = [hook for hook in self.other_hooks if hook.model.device == self.execution_device]
                # offload all other hooks
                start_time = time.perf_counter()
                if self.offload_strategy is not None:
                    hooks_to_offload = self.offload_strategy(
                        hooks=hooks_to_offload,
                        model_id=self.model_id,
                        model=module,
                        execution_device=self.execution_device,
                    )
                end_time = time.perf_counter()
                logger.info(
                    f" time taken to apply offload strategy for {self.model_id}: {(end_time - start_time):.2f} seconds"
                )

                for hook in hooks_to_offload:
                    logger.info(
                        f"moving {self.model_id} to {self.execution_device}, offloading {hook.model_id} to cpu"
                    )
                    hook.offload()

                if hooks_to_offload:
                    clear_device_cache()
            module.to(self.execution_device)
        return send_to_device(args, self.execution_device), send_to_device(kwargs, self.execution_device)


class UserCustomOffloadHook:
    """
    A simple hook grouping a model and a `CustomOffloadHook`, which provides easy APIs to call the init method of
    the hook or remove it entirely.
    """

    def __init__(self, model_id, model, hook):
        self.model_id = model_id
        self.model = model
        self.hook = hook

    def offload(self):
        self.hook.init_hook(self.model)

    def attach(self):
        add_hook_to_module(self.model, self.hook)
        self.hook.model_id = self.model_id

    def remove(self):
        remove_hook_from_module(self.model)
        self.hook.model_id = None

    def add_other_hook(self, hook: "UserCustomOffloadHook"):
        self.hook.add_other_hook(hook)


def custom_offload_with_hook(
    model_id: str,
    model: torch.nn.Module,
    execution_device: Union[str, int, torch.device] = None,
    offload_strategy: Optional["AutoOffloadStrategy"] = None,
):
    hook = CustomOffloadHook(execution_device=execution_device, offload_strategy=offload_strategy)
    user_hook = UserCustomOffloadHook(model_id=model_id, model=model, hook=hook)
    user_hook.attach()
    return user_hook


class AutoOffloadStrategy:
    """
    Offload strategy that should be used with `CustomOffloadHook` to automatically offload models to the CPU based on
    the available memory on the device.
    """

    def __init__(self, memory_reserve_margin="3GB"):
        self.memory_reserve_margin = convert_file_size_to_int(memory_reserve_margin)

    def __call__(self, hooks, model_id, model, execution_device):
        if len(hooks) == 0:
            return []

        current_module_size = get_memory_footprint(model)

        mem_on_device = torch.cuda.mem_get_info(execution_device.index)[0]
        mem_on_device = mem_on_device - self.memory_reserve_margin
        if current_module_size < mem_on_device:
            return []

        min_memory_offload = current_module_size - mem_on_device
        logger.info(f" search for models to offload in order to free up {min_memory_offload / 1024**3:.2f} GB memory")

        # exclude models that are not currently loaded on the device
        module_sizes = dict(
            sorted(
                {hook.model_id: get_memory_footprint(hook.model) for hook in hooks}.items(),
                key=lambda x: x[1],
                reverse=True,
            )
        )

        def search_best_candidate(module_sizes, min_memory_offload):
            """
            Search the optimal combination of models to offload to cpu, given a dictionary of module sizes and a
            minimum memory offload size. The combination of models should add up to the smallest module size that is
            larger than `min_memory_offload`.
            """
            model_ids = list(module_sizes.keys())
            best_candidate = None
            best_size = float("inf")
            for r in range(1, len(model_ids) + 1):
                for candidate_model_ids in combinations(model_ids, r):
                    candidate_size = sum(
                        module_sizes[candidate_model_id] for candidate_model_id in candidate_model_ids
                    )
                    if candidate_size < min_memory_offload:
                        continue
                    else:
                        if best_candidate is None or candidate_size < best_size:
                            best_candidate = candidate_model_ids
                            best_size = candidate_size

            return best_candidate

        best_offload_model_ids = search_best_candidate(module_sizes, min_memory_offload)

        if best_offload_model_ids is None:
            # if no combination is found, meaning that we cannot meet the memory requirement, offload all models
            logger.warning("no combination of models to offload to cpu is found, offloading all models")
            hooks_to_offload = hooks
        else:
            hooks_to_offload = [hook for hook in hooks if hook.model_id in best_offload_model_ids]

        return hooks_to_offload

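A toy illustration (not from the file) of the candidate search `AutoOffloadStrategy` performs: pick the smallest combination of resident models whose combined size still covers the memory that needs to be freed. The sizes below are made up.

```python
from itertools import combinations

module_sizes = {"unet": 10, "text_encoder": 2, "vae": 1}  # GB, illustrative only
min_memory_offload = 3

best, best_size = None, float("inf")
for r in range(1, len(module_sizes) + 1):
    for ids in combinations(module_sizes, r):
        size = sum(module_sizes[i] for i in ids)
        if min_memory_offload <= size < best_size:
            best, best_size = ids, size

print(best)  # ('text_encoder', 'vae') -> frees exactly 3 GB, the smallest sufficient set
```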
class ComponentsManager:
    def __init__(self):
        self.components = OrderedDict()
        self.added_time = OrderedDict()  # Store when components were added
        self.collections = OrderedDict()  # collection_name -> set of component_names
        self.model_hooks = None
        self._auto_offload_enabled = False

    def _lookup_ids(self, name=None, collection=None, load_id=None, components: OrderedDict = None):
        """
        Lookup component_ids by name, collection, or load_id.
        """
        if components is None:
            components = self.components

        if name:
            ids_by_name = set()
            for component_id, component in components.items():
                comp_name = self._id_to_name(component_id)
                if comp_name == name:
                    ids_by_name.add(component_id)
        else:
            ids_by_name = set(components.keys())
        if collection:
            ids_by_collection = set()
            for component_id, component in components.items():
                if component_id in self.collections[collection]:
                    ids_by_collection.add(component_id)
        else:
            ids_by_collection = set(components.keys())
        if load_id:
            ids_by_load_id = set()
            for name, component in components.items():
                if hasattr(component, "_diffusers_load_id") and component._diffusers_load_id == load_id:
                    ids_by_load_id.add(name)
        else:
            ids_by_load_id = set(components.keys())

        ids = ids_by_name.intersection(ids_by_collection).intersection(ids_by_load_id)
        return ids

    @staticmethod
    def _id_to_name(component_id: str):
        return "_".join(component_id.split("_")[:-1])

    def add(self, name, component, collection: Optional[str] = None):
        component_id = f"{name}_{uuid.uuid4()}"

        # check for duplicated components
        for comp_id, comp in self.components.items():
            if comp == component:
                comp_name = self._id_to_name(comp_id)
                if comp_name == name:
                    logger.warning(f"component '{name}' already exists as '{comp_id}'")
                    component_id = comp_id
                    break
                else:
                    logger.warning(
                        f"Adding component '{name}' as '{component_id}', but it is duplicate of '{comp_id}'. "
                        f"To remove a duplicate, call `components_manager.remove('<component_id>')`."
                    )

        # check for duplicated load_id and warn (we do not delete for you)
        if hasattr(component, "_diffusers_load_id") and component._diffusers_load_id != "null":
            components_with_same_load_id = self._lookup_ids(load_id=component._diffusers_load_id)
            components_with_same_load_id = [id for id in components_with_same_load_id if id != component_id]

            if components_with_same_load_id:
                existing = ", ".join(components_with_same_load_id)
                logger.warning(
                    f"Adding component '{component_id}', but it has duplicate load_id '{component._diffusers_load_id}' with existing components: {existing}. "
                    f"To remove a duplicate, call `components_manager.remove('<component_id>')`."
                )

        # add component to components manager
        self.components[component_id] = component
        self.added_time[component_id] = time.time()

        if collection:
            if collection not in self.collections:
                self.collections[collection] = set()
            if component_id not in self.collections[collection]:
                comp_ids_in_collection = self._lookup_ids(name=name, collection=collection)
                for comp_id in comp_ids_in_collection:
                    logger.info(f"Removing existing {name} from collection '{collection}': {comp_id}")
                    self.remove(comp_id)
                self.collections[collection].add(component_id)
                logger.info(f"Added component '{name}' in collection '{collection}': {component_id}")
        else:
            logger.info(f"Added component '{name}' as '{component_id}'")

        if self._auto_offload_enabled:
            self.enable_auto_cpu_offload(self._auto_offload_device)

        return component_id

    def remove(self, component_id: str = None):
        if component_id not in self.components:
            logger.warning(f"Component '{component_id}' not found in ComponentsManager")
            return

        component = self.components.pop(component_id)
        self.added_time.pop(component_id)

        for collection in self.collections:
            if component_id in self.collections[collection]:
                self.collections[collection].remove(component_id)

        if self._auto_offload_enabled:
            self.enable_auto_cpu_offload(self._auto_offload_device)
        else:
            if isinstance(component, torch.nn.Module):
                component.to("cpu")
            del component
            import gc

            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    def get(self, names: Union[str, List[str]] = None, collection: Optional[str] = None, load_id: Optional[str] = None,
            as_name_component_tuples: bool = False):
        """
        Select components by name with simple pattern matching.

        Args:
            names: Component name(s) or pattern(s)
                Patterns:
                - "unet" : match any component with base name "unet" (e.g., unet_123abc)
                - "!unet" : everything except components with base name "unet"
                - "unet*" : anything with base name starting with "unet"
                - "!unet*" : anything with base name NOT starting with "unet"
                - "*unet*" : anything with base name containing "unet"
                - "!*unet*" : anything with base name NOT containing "unet"
                - "refiner|vae|unet" : anything with base name exactly matching "refiner", "vae", or "unet"
                - "!refiner|vae|unet" : anything with base name NOT exactly matching "refiner", "vae", or "unet"
                - "unet*|vae*" : anything with base name starting with "unet" OR starting with "vae"
            collection: Optional collection to filter by
            load_id: Optional load_id to filter by
            as_name_component_tuples: If True, returns a list of (name, component) tuples using base names
                instead of a dictionary with component IDs as keys

        Returns:
            Dictionary mapping component IDs to components,
            or list of (base_name, component) tuples if as_name_component_tuples=True
        """

        selected_ids = self._lookup_ids(collection=collection, load_id=load_id)
        components = {k: self.components[k] for k in selected_ids}

        # Helper to extract base name from component_id
        def get_base_name(component_id):
            parts = component_id.split('_')
            # If the last part looks like a UUID, remove it
            if len(parts) > 1 and len(parts[-1]) >= 8 and '-' in parts[-1]:
                return '_'.join(parts[:-1])
            return component_id

        if names is None:
            if as_name_component_tuples:
                return [(get_base_name(comp_id), comp) for comp_id, comp in components.items()]
            else:
                return components

        # Create mapping from component_id to base_name for all components
        base_names = {comp_id: get_base_name(comp_id) for comp_id in components.keys()}

        def matches_pattern(component_id, pattern, exact_match=False):
            """
            Helper function to check if a component matches a pattern based on its base name.

            Args:
                component_id: The component ID to check
                pattern: The pattern to match against
                exact_match: If True, only exact matches to base_name are considered
            """
            base_name = base_names[component_id]

            # Exact match with base name
            if exact_match:
                return pattern == base_name

            # Prefix match (ends with *)
            elif pattern.endswith('*'):
                prefix = pattern[:-1]
                return base_name.startswith(prefix)

            # Contains match (starts with *)
            elif pattern.startswith('*'):
                search = pattern[1:-1] if pattern.endswith('*') else pattern[1:]
                return search in base_name

            # Exact match (no wildcards)
            else:
                return pattern == base_name

        if isinstance(names, str):
            # Check if this is a "not" pattern
            is_not_pattern = names.startswith('!')
            if is_not_pattern:
                names = names[1:]  # Remove the ! prefix

            # Handle OR patterns (containing |)
            if '|' in names:
                terms = names.split('|')
                matches = {}

                for comp_id, comp in components.items():
                    # For OR patterns with exact names (no wildcards), we do exact matching on base names
                    exact_match = all(not (term.startswith('*') or term.endswith('*')) for term in terms)

                    # Check if any of the terms match this component
                    should_include = any(matches_pattern(comp_id, term, exact_match) for term in terms)

                    # Flip the decision if this is a NOT pattern
                    if is_not_pattern:
                        should_include = not should_include

                    if should_include:
                        matches[comp_id] = comp

                log_msg = "NOT " if is_not_pattern else ""
                match_type = "exactly matching" if exact_match else "matching any of patterns"
                logger.info(f"Getting components {log_msg}{match_type} {terms}: {list(matches.keys())}")

            # Try exact match with a base name
            elif any(names == base_name for base_name in base_names.values()):
                # Find all components with this base name
                matches = {
                    comp_id: comp for comp_id, comp in components.items()
                    if (base_names[comp_id] == names) != is_not_pattern
                }

                if is_not_pattern:
                    logger.info(f"Getting all components except those with base name '{names}': {list(matches.keys())}")
                else:
                    logger.info(f"Getting components with base name '{names}': {list(matches.keys())}")

            # Prefix match (ends with *)
            elif names.endswith('*'):
                prefix = names[:-1]
                matches = {
                    comp_id: comp for comp_id, comp in components.items()
                    if base_names[comp_id].startswith(prefix) != is_not_pattern
                }
                if is_not_pattern:
                    logger.info(f"Getting components NOT starting with '{prefix}': {list(matches.keys())}")
                else:
                    logger.info(f"Getting components starting with '{prefix}': {list(matches.keys())}")

            # Contains match (starts with *)
            elif names.startswith('*'):
                search = names[1:-1] if names.endswith('*') else names[1:]
                matches = {
                    comp_id: comp for comp_id, comp in components.items()
                    if (search in base_names[comp_id]) != is_not_pattern
                }
                if is_not_pattern:
                    logger.info(f"Getting components NOT containing '{search}': {list(matches.keys())}")
                else:
                    logger.info(f"Getting components containing '{search}': {list(matches.keys())}")

            # Substring match (no wildcards, but not an exact component name)
            elif any(names in base_name for base_name in base_names.values()):
                matches = {
                    comp_id: comp for comp_id, comp in components.items()
                    if (names in base_names[comp_id]) != is_not_pattern
                }
                if is_not_pattern:
                    logger.info(f"Getting components NOT containing '{names}': {list(matches.keys())}")
                else:
                    logger.info(f"Getting components containing '{names}': {list(matches.keys())}")

            else:
                raise ValueError(f"Component or pattern '{names}' not found in ComponentsManager")

            if not matches:
                raise ValueError(f"No components found matching pattern '{names}'")

            if as_name_component_tuples:
                return [(base_names[comp_id], comp) for comp_id, comp in matches.items()]
            else:
                return matches

        elif isinstance(names, list):
            results = {}
            for name in names:
                result = self.get(name, collection, load_id, as_name_component_tuples=False)
                results.update(result)

            if as_name_component_tuples:
                return [(base_names[comp_id], comp) for comp_id, comp in results.items()]
            else:
                return results

        else:
            raise ValueError(f"Invalid type for names: {type(names)}")

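Hypothetical usage of the pattern matching documented in `get()` above, with stand-in `torch.nn.Linear` modules; it assumes the `modular_pipelines` package from this branch is importable.

```python
import torch

from diffusers.modular_pipelines import ComponentsManager

manager = ComponentsManager()
manager.add("unet", torch.nn.Linear(4, 4), collection="sdxl")          # stand-in modules
manager.add("text_encoder", torch.nn.Linear(4, 4), collection="sdxl")
manager.add("vae", torch.nn.Linear(4, 4), collection="sdxl")

manager.get("unet")                       # every component whose base name is exactly "unet"
manager.get("text_encoder|vae")           # exact-name OR pattern
manager.get("!unet", collection="sdxl")   # everything in "sdxl" except "unet"
```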
def enable_auto_cpu_offload(self, device: Union[str, int, torch.device]="cuda", memory_reserve_margin="3GB"):
|
||||||
|
for name, component in self.components.items():
|
||||||
|
if isinstance(component, torch.nn.Module) and hasattr(component, "_hf_hook"):
|
||||||
|
remove_hook_from_module(component, recurse=True)
|
||||||
|
|
||||||
|
self.disable_auto_cpu_offload()
|
||||||
|
offload_strategy = AutoOffloadStrategy(memory_reserve_margin=memory_reserve_margin)
|
||||||
|
device = torch.device(device)
|
||||||
|
if device.index is None:
|
||||||
|
device = torch.device(f"{device.type}:{0}")
|
||||||
|
all_hooks = []
|
||||||
|
for name, component in self.components.items():
|
||||||
|
if isinstance(component, torch.nn.Module):
|
||||||
|
hook = custom_offload_with_hook(name, component, device, offload_strategy=offload_strategy)
|
||||||
|
all_hooks.append(hook)
|
||||||
|
|
||||||
|
for hook in all_hooks:
|
||||||
|
other_hooks = [h for h in all_hooks if h is not hook]
|
||||||
|
for other_hook in other_hooks:
|
||||||
|
if other_hook.hook.execution_device == hook.hook.execution_device:
|
||||||
|
hook.add_other_hook(other_hook)
|
||||||
|
|
||||||
|
self.model_hooks = all_hooks
|
||||||
|
self._auto_offload_enabled = True
|
||||||
|
self._auto_offload_device = device
|
||||||
|
|
||||||
|
def disable_auto_cpu_offload(self):
|
||||||
|
if self.model_hooks is None:
|
||||||
|
self._auto_offload_enabled = False
|
||||||
|
return
|
||||||
|
|
||||||
|
for hook in self.model_hooks:
|
||||||
|
hook.offload()
|
||||||
|
hook.remove()
|
||||||
|
if self.model_hooks:
|
||||||
|
clear_device_cache()
|
||||||
|
self.model_hooks = None
|
||||||
|
self._auto_offload_enabled = False
|
||||||
|
|
||||||
|
# YiYi TODO: add quantization info
|
||||||
|
def get_model_info(self, component_id: str, fields: Optional[Union[str, List[str]]] = None) -> Optional[Dict[str, Any]]:
|
||||||
|
"""Get comprehensive information about a component.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
component_id: Name of the component to get info for
|
||||||
|
fields: Optional field(s) to return. Can be a string for single field or list of fields.
|
||||||
|
If None, returns all fields.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary containing requested component metadata.
|
||||||
|
If fields is specified, returns only those fields.
|
||||||
|
If a single field is requested as a string, returns a dict containing just that field.
|
||||||
|
"""
|
||||||
|
if component_id not in self.components:
|
||||||
|
raise ValueError(f"Component '{component_id}' not found in ComponentsManager")
|
||||||
|
|
||||||
|
component = self.components[component_id]
|
||||||
|
|
||||||
|
# Build complete info dict first
|
||||||
|
info = {
|
||||||
|
"model_id": component_id,
|
||||||
|
"added_time": self.added_time[component_id],
|
||||||
|
"collection": ", ".join([coll for coll, comps in self.collections.items() if component_id in comps]) or None,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Additional info for torch.nn.Module components
|
||||||
|
if isinstance(component, torch.nn.Module):
|
||||||
|
# Check for hook information
|
||||||
|
has_hook = hasattr(component, "_hf_hook")
|
||||||
|
execution_device = None
|
||||||
|
if has_hook and hasattr(component._hf_hook, "execution_device"):
|
||||||
|
execution_device = component._hf_hook.execution_device
|
||||||
|
|
||||||
|
info.update({
|
||||||
|
"class_name": component.__class__.__name__,
|
||||||
|
"size_gb": get_memory_footprint(component) / (1024**3),
|
||||||
|
"adapters": None, # Default to None
|
||||||
|
"has_hook": has_hook,
|
||||||
|
"execution_device": execution_device,
|
||||||
|
})
|
||||||
|
|
||||||
|
# Get adapters if applicable
|
||||||
|
if hasattr(component, "peft_config"):
|
||||||
|
info["adapters"] = list(component.peft_config.keys())
|
||||||
|
|
||||||
|
# Check for IP-Adapter scales
|
||||||
|
if hasattr(component, "_load_ip_adapter_weights") and hasattr(component, "attn_processors"):
|
||||||
|
processors = copy.deepcopy(component.attn_processors)
|
||||||
|
# First check if any processor is an IP-Adapter
|
||||||
|
processor_types = [v.__class__.__name__ for v in processors.values()]
|
||||||
|
if any("IPAdapter" in ptype for ptype in processor_types):
|
||||||
|
# Then get scales only from IP-Adapter processors
|
||||||
|
scales = {
|
||||||
|
k: v.scale
|
||||||
|
for k, v in processors.items()
|
||||||
|
if hasattr(v, "scale") and "IPAdapter" in v.__class__.__name__
|
||||||
|
}
|
||||||
|
if scales:
|
||||||
|
info["ip_adapter"] = summarize_dict_by_value_and_parts(scales)
|
||||||
|
|
||||||
|
# If fields specified, filter info
|
||||||
|
if fields is not None:
|
||||||
|
if isinstance(fields, str):
|
||||||
|
# Single field requested, return a dict with just that field
|
||||||
|
return {fields: info.get(fields)}
|
||||||
|
else:
|
||||||
|
# List of fields requested, return dict with just those fields
|
||||||
|
return {k: v for k, v in info.items() if k in fields}
|
||||||
|
|
||||||
|
return info
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
# Helper to get simple name without UUID
|
||||||
|
def get_simple_name(name):
|
||||||
|
# Extract the base name by splitting on underscore and taking first part
|
||||||
|
# This assumes names are in format "name_uuid"
|
||||||
|
parts = name.split('_')
|
||||||
|
# If we have at least 2 parts and the last part looks like a UUID, remove it
|
||||||
|
if len(parts) > 1 and len(parts[-1]) >= 8 and '-' in parts[-1]:
|
||||||
|
return '_'.join(parts[:-1])
|
||||||
|
return name
|
||||||
|
|
||||||
|
# Extract load_id if available
|
||||||
|
def get_load_id(component):
|
||||||
|
if hasattr(component, "_diffusers_load_id"):
|
||||||
|
return component._diffusers_load_id
|
||||||
|
return "N/A"
|
||||||
|
|
||||||
|
# Format device info compactly
|
||||||
|
def format_device(component, info):
|
||||||
|
if not info["has_hook"]:
|
||||||
|
return str(getattr(component, 'device', 'N/A'))
|
||||||
|
else:
|
||||||
|
device = str(getattr(component, 'device', 'N/A'))
|
||||||
|
exec_device = str(info['execution_device'] or 'N/A')
|
||||||
|
return f"{device}({exec_device})"
|
||||||
|
|
||||||
|
# Get all simple names to calculate width
|
||||||
|
simple_names = [get_simple_name(id) for id in self.components.keys()]
|
||||||
|
|
||||||
|
# Get max length of load_ids for models
|
||||||
|
load_ids = [
|
||||||
|
get_load_id(component)
|
||||||
|
for component in self.components.values()
|
||||||
|
if isinstance(component, torch.nn.Module) and hasattr(component, "_diffusers_load_id")
|
||||||
|
]
|
||||||
|
max_load_id_len = max([15] + [len(str(lid)) for lid in load_ids]) if load_ids else 15
|
||||||
|
|
||||||
|
# Get all collections for each component
|
||||||
|
component_collections = {}
|
||||||
|
for name in self.components.keys():
|
||||||
|
component_collections[name] = []
|
||||||
|
for coll, comps in self.collections.items():
|
||||||
|
if name in comps:
|
||||||
|
component_collections[name].append(coll)
|
||||||
|
if not component_collections[name]:
|
||||||
|
component_collections[name] = ["N/A"]
|
||||||
|
|
||||||
|
# Find the maximum collection name length
|
||||||
|
all_collections = [coll for colls in component_collections.values() for coll in colls]
|
||||||
|
max_collection_len = max(10, max(len(str(c)) for c in all_collections)) if all_collections else 10
|
||||||
|
|
||||||
|
col_widths = {
|
||||||
|
"name": max(15, max(len(name) for name in simple_names)),
|
||||||
|
"class": max(25, max(len(component.__class__.__name__) for component in self.components.values())),
|
||||||
|
"device": 15, # Reduced since using more compact format
|
||||||
|
"dtype": 15,
|
||||||
|
"size": 10,
|
||||||
|
"load_id": max_load_id_len,
|
||||||
|
"collection": max_collection_len
|
||||||
|
}
|
||||||
|
|
||||||
|
# Create the header lines
|
||||||
|
sep_line = "=" * (sum(col_widths.values()) + len(col_widths) * 3 - 1) + "\n"
|
||||||
|
dash_line = "-" * (sum(col_widths.values()) + len(col_widths) * 3 - 1) + "\n"
|
||||||
|
|
||||||
|
output = "Components:\n" + sep_line
|
||||||
|
|
||||||
|
# Separate components into models and others
|
||||||
|
models = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)}
|
||||||
|
others = {k: v for k, v in self.components.items() if not isinstance(v, torch.nn.Module)}
|
||||||
|
|
||||||
|
# Models section
|
||||||
|
if models:
|
||||||
|
output += "Models:\n" + dash_line
|
||||||
|
# Column headers
|
||||||
|
output += f"{'Name':<{col_widths['name']}} | {'Class':<{col_widths['class']}} | "
|
||||||
|
output += f"{'Device':<{col_widths['device']}} | {'Dtype':<{col_widths['dtype']}} | "
|
||||||
|
output += f"{'Size (GB)':<{col_widths['size']}} | {'Load ID':<{col_widths['load_id']}} | Collection\n"
|
||||||
|
output += dash_line
|
||||||
|
|
||||||
|
# Model entries
|
||||||
|
for name, component in models.items():
|
||||||
|
info = self.get_model_info(name)
|
||||||
|
simple_name = get_simple_name(name)
|
||||||
|
device_str = format_device(component, info)
|
||||||
|
dtype = str(component.dtype) if hasattr(component, "dtype") else "N/A"
|
||||||
|
load_id = get_load_id(component)
|
||||||
|
|
||||||
|
# Print first collection on the main line
|
||||||
|
first_collection = component_collections[name][0] if component_collections[name] else "N/A"
|
||||||
|
|
||||||
|
output += f"{simple_name:<{col_widths['name']}} | {info['class_name']:<{col_widths['class']}} | "
|
||||||
|
output += f"{device_str:<{col_widths['device']}} | {dtype:<{col_widths['dtype']}} | "
|
||||||
|
output += f"{info['size_gb']:<{col_widths['size']}.2f} | {load_id:<{col_widths['load_id']}} | {first_collection}\n"
|
||||||
|
|
||||||
|
# Print additional collections on separate lines if they exist
|
||||||
|
for i in range(1, len(component_collections[name])):
|
||||||
|
collection = component_collections[name][i]
|
||||||
|
output += f"{'':<{col_widths['name']}} | {'':<{col_widths['class']}} | "
|
||||||
|
output += f"{'':<{col_widths['device']}} | {'':<{col_widths['dtype']}} | "
|
||||||
|
output += f"{'':<{col_widths['size']}} | {'':<{col_widths['load_id']}} | {collection}\n"
|
||||||
|
|
||||||
|
output += dash_line
|
||||||
|
|
||||||
|
# Other components section
|
||||||
|
if others:
|
||||||
|
if models: # Add extra newline if we had models section
|
||||||
|
output += "\n"
|
||||||
|
output += "Other Components:\n" + dash_line
|
||||||
|
# Column headers for other components
|
||||||
|
output += f"{'Name':<{col_widths['name']}} | {'Class':<{col_widths['class']}} | Collection\n"
|
||||||
|
output += dash_line
|
||||||
|
|
||||||
|
# Other component entries
|
||||||
|
for name, component in others.items():
|
||||||
|
info = self.get_model_info(name)
|
||||||
|
simple_name = get_simple_name(name)
|
||||||
|
|
||||||
|
# Print first collection on the main line
|
||||||
|
first_collection = component_collections[name][0] if component_collections[name] else "N/A"
|
||||||
|
|
||||||
|
output += f"{simple_name:<{col_widths['name']}} | {component.__class__.__name__:<{col_widths['class']}} | {first_collection}\n"
|
||||||
|
|
||||||
|
# Print additional collections on separate lines if they exist
|
||||||
|
for i in range(1, len(component_collections[name])):
|
||||||
|
collection = component_collections[name][i]
|
||||||
|
output += f"{'':<{col_widths['name']}} | {'':<{col_widths['class']}} | {collection}\n"
|
||||||
|
|
||||||
|
output += dash_line
|
||||||
|
|
||||||
|
# Add additional component info
|
||||||
|
output += "\nAdditional Component Info:\n" + "=" * 50 + "\n"
|
||||||
|
for name in self.components:
|
||||||
|
info = self.get_model_info(name)
|
||||||
|
if info is not None and (info.get("adapters") is not None or info.get("ip_adapter")):
|
||||||
|
simple_name = get_simple_name(name)
|
||||||
|
output += f"\n{simple_name}:\n"
|
||||||
|
if info.get("adapters") is not None:
|
||||||
|
output += f" Adapters: {info['adapters']}\n"
|
||||||
|
if info.get("ip_adapter"):
|
||||||
|
output += f" IP-Adapter: Enabled\n"
|
||||||
|
output += f" Added Time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(info['added_time']))}\n"
|
||||||
|
|
||||||
|
return output
|
||||||
|
|
||||||
|
def from_pretrained(self, pretrained_model_name_or_path, prefix: Optional[str] = None, **kwargs):
|
||||||
|
"""
|
||||||
|
Load components from a pretrained model and add them to the manager.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
pretrained_model_name_or_path (str): The path or identifier of the pretrained model
|
||||||
|
prefix (str, optional): Prefix to add to all component names loaded from this model.
|
||||||
|
If provided, components will be named as "{prefix}_{component_name}"
|
||||||
|
**kwargs: Additional arguments to pass to DiffusionPipeline.from_pretrained()
|
||||||
|
"""
|
||||||
|
subfolder = kwargs.pop("subfolder", None)
|
||||||
|
# YiYi TODO: extend AutoModel to support non-diffusers models
|
||||||
|
if subfolder:
|
||||||
|
from ..models import AutoModel
|
||||||
|
component = AutoModel.from_pretrained(pretrained_model_name_or_path, subfolder=subfolder, **kwargs)
|
||||||
|
component_name = f"{prefix}_{subfolder}" if prefix else subfolder
|
||||||
|
if component_name not in self.components:
|
||||||
|
self.add(component_name, component)
|
||||||
|
else:
|
||||||
|
logger.warning(
|
||||||
|
f"Component '{component_name}' already exists in ComponentsManager and will not be added. To add it, either:\n"
|
||||||
|
f"1. remove the existing component with remove('{component_name}')\n"
|
||||||
|
f"2. Use a different prefix: add_from_pretrained(..., prefix='{prefix}_2')"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
from ..pipelines.pipeline_utils import DiffusionPipeline
|
||||||
|
pipe = DiffusionPipeline.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||||
|
for name, component in pipe.components.items():
|
||||||
|
|
||||||
|
if component is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Add prefix if specified
|
||||||
|
component_name = f"{prefix}_{name}" if prefix else name
|
||||||
|
|
||||||
|
if component_name not in self.components:
|
||||||
|
self.add(component_name, component)
|
||||||
|
else:
|
||||||
|
logger.warning(
|
||||||
|
f"Component '{component_name}' already exists in ComponentsManager and will not be added. To add it, either:\n"
|
||||||
|
f"1. remove the existing component with remove('{component_name}')\n"
|
||||||
|
f"2. Use a different prefix: add_from_pretrained(..., prefix='{prefix}_2')"
|
||||||
|
)
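# Example (illustrative sketch): loading an entire pipeline repo under a prefix, or a single
# model via `subfolder`. The repo ids below are placeholders, not implied by this diff.
# >>> manager = ComponentsManager()
# >>> manager.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", prefix="sdxl")
# >>> manager.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="vae", prefix="fixed")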
|
||||||
|
|
||||||
|
def get_one(self, component_id: Optional[str] = None, name: Optional[str] = None, collection: Optional[str] = None, load_id: Optional[str] = None) -> Any:
|
||||||
|
"""
|
||||||
|
Get a single component by name. Raises an error if multiple components match or none are found.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
component_id: Optional unique component id; if provided, it is looked up directly and no other filters may be used
name: Component name or pattern
|
||||||
|
collection: Optional collection to filter by
|
||||||
|
load_id: Optional load_id to filter by
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A single component
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If no components match or multiple components match
|
||||||
|
"""
|
||||||
|
|
||||||
|
# if component_id is provided, return the component
|
||||||
|
if component_id is not None and (name is not None or collection is not None or load_id is not None):
|
||||||
|
raise ValueError(" if component_id is provided, name, collection, and load_id must be None")
|
||||||
|
elif component_id is not None:
|
||||||
|
if component_id not in self.components:
|
||||||
|
raise ValueError(f"Component '{component_id}' not found in ComponentsManager")
|
||||||
|
return self.components[component_id]
|
||||||
|
|
||||||
|
results = self.get(name, collection, load_id)
|
||||||
|
|
||||||
|
if not results:
|
||||||
|
raise ValueError(f"No components found matching '{name}'")
|
||||||
|
|
||||||
|
if len(results) > 1:
|
||||||
|
raise ValueError(f"Multiple components found matching '{name}': {list(results.keys())}")
|
||||||
|
|
||||||
|
return next(iter(results.values()))
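# Example (illustrative sketch): `get_one` returns exactly one component and raises if zero or
# several match, while `get` accepts simple patterns ("unet*", "!unet", "base|refiner").
# The component names and collection below are assumptions for demonstration only.
# >>> unet = manager.get_one(name="unet", collection="sdxl")
# >>> text_encoders = manager.get("text_encoder*", as_name_component_tuples=True)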
|
||||||
|
|
||||||
|
def summarize_dict_by_value_and_parts(d: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
|
"""Summarizes a dictionary by finding common prefixes that share the same value.
|
||||||
|
|
||||||
|
For a dictionary with dot-separated keys like:
|
||||||
|
{
|
||||||
|
'down_blocks.1.attentions.1.transformer_blocks.0.attn2.processor': [0.6],
|
||||||
|
'down_blocks.1.attentions.1.transformer_blocks.1.attn2.processor': [0.6],
|
||||||
|
'up_blocks.1.attentions.0.transformer_blocks.0.attn2.processor': [0.3],
|
||||||
|
}
|
||||||
|
|
||||||
|
Returns a dictionary keyed by the common prefix shared by all keys with the same value
(the full key is kept when a value group has a single member):
{
'down_blocks.1.attentions.1.transformer_blocks': [0.6],
'up_blocks.1.attentions.0.transformer_blocks.0.attn2.processor': [0.3]
}
|
||||||
|
"""
|
||||||
|
# First group by values - convert lists to tuples to make them hashable
|
||||||
|
value_to_keys = {}
|
||||||
|
for key, value in d.items():
|
||||||
|
value_tuple = tuple(value) if isinstance(value, list) else value
|
||||||
|
if value_tuple not in value_to_keys:
|
||||||
|
value_to_keys[value_tuple] = []
|
||||||
|
value_to_keys[value_tuple].append(key)
|
||||||
|
|
||||||
|
def find_common_prefix(keys: List[str]) -> str:
|
||||||
|
"""Find the shortest common prefix among a list of dot-separated keys."""
|
||||||
|
if not keys:
|
||||||
|
return ""
|
||||||
|
if len(keys) == 1:
|
||||||
|
return keys[0]
|
||||||
|
|
||||||
|
# Split all keys into parts
|
||||||
|
key_parts = [k.split('.') for k in keys]
|
||||||
|
|
||||||
|
# Find how many initial parts are common
|
||||||
|
common_length = 0
|
||||||
|
for parts in zip(*key_parts):
|
||||||
|
if len(set(parts)) == 1: # All parts at this position are the same
|
||||||
|
common_length += 1
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
|
||||||
|
if common_length == 0:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
# Return the common prefix
|
||||||
|
return '.'.join(key_parts[0][:common_length])
|
||||||
|
|
||||||
|
# Create summary by finding common prefixes for each value group
|
||||||
|
summary = {}
|
||||||
|
for value_tuple, keys in value_to_keys.items():
|
||||||
|
prefix = find_common_prefix(keys)
|
||||||
|
if prefix: # Only add if we found a common prefix
|
||||||
|
# Convert tuple back to list if it was originally a list
|
||||||
|
value = list(value_tuple) if isinstance(d[keys[0]], list) else value_tuple
|
||||||
|
summary[prefix] = value
|
||||||
|
else:
|
||||||
|
summary[""] = value # Use empty string if no common prefix
|
||||||
|
|
||||||
|
return summary
|
||||||
2419	src/diffusers/modular_pipelines/modular_pipeline.py	Normal file
File diff suppressed because it is too large. Load Diff
616	src/diffusers/modular_pipelines/modular_pipeline_utils.py	Normal file
@@ -0,0 +1,616 @@
|
|||||||
|
# Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import re
|
||||||
|
import inspect
|
||||||
|
from dataclasses import dataclass, asdict, field, fields
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple, Type, Union, Literal
|
||||||
|
|
||||||
|
from ..utils.import_utils import is_torch_available
|
||||||
|
from ..configuration_utils import FrozenDict, ConfigMixin
|
||||||
|
from collections import OrderedDict
|
||||||
|
|
||||||
|
if is_torch_available():
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
class InsertableOrderedDict(OrderedDict):
|
||||||
|
def insert(self, key, value, index):
|
||||||
|
items = list(self.items())
|
||||||
|
|
||||||
|
# Remove key if it already exists to avoid duplicates
|
||||||
|
items = [(k, v) for k, v in items if k != key]
|
||||||
|
|
||||||
|
# Insert at the specified index
|
||||||
|
items.insert(index, (key, value))
|
||||||
|
|
||||||
|
# Clear and update self
|
||||||
|
self.clear()
|
||||||
|
self.update(items)
|
||||||
|
|
||||||
|
# Return self for method chaining
|
||||||
|
return self
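# Example (illustrative sketch): inserting a key at a given position; an existing key with the
# same name is replaced rather than duplicated.
# >>> d = InsertableOrderedDict(a=1, c=3)
# >>> list(d.insert("b", 2, 1).items())
# [('a', 1), ('b', 2), ('c', 3)]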
|
||||||
|
|
||||||
|
|
||||||
|
# YiYi TODO:
|
||||||
|
# 1. validate the dataclass fields
|
||||||
|
# 2. add a validator for create_* methods, make sure they are valid inputs to pass to from_pretrained()
|
||||||
|
@dataclass
|
||||||
|
class ComponentSpec:
|
||||||
|
"""Specification for a pipeline component.
|
||||||
|
|
||||||
|
A component can be created in two ways:
|
||||||
|
1. From scratch using __init__ with a config dict
|
||||||
|
2. using `from_pretrained`
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
name: Name of the component
|
||||||
|
type_hint: Type of the component (e.g. UNet2DConditionModel)
|
||||||
|
description: Optional description of the component
|
||||||
|
config: Optional config dict for __init__ creation
|
||||||
|
repo: Optional repo path for from_pretrained creation
|
||||||
|
subfolder: Optional subfolder in repo
|
||||||
|
variant: Optional variant in repo
|
||||||
|
revision: Optional revision in repo
|
||||||
|
default_creation_method: Preferred creation method - "from_config" or "from_pretrained"
|
||||||
|
"""
|
||||||
|
name: Optional[str] = None
|
||||||
|
type_hint: Optional[Type] = None
|
||||||
|
description: Optional[str] = None
|
||||||
|
config: Optional[FrozenDict[str, Any]] = None
|
||||||
|
# YiYi Notes: should we change it to pretrained_model_name_or_path for consistency? a bit long for a field name
|
||||||
|
repo: Optional[Union[str, List[str]]] = field(default=None, metadata={"loading": True})
|
||||||
|
subfolder: Optional[str] = field(default=None, metadata={"loading": True})
|
||||||
|
variant: Optional[str] = field(default=None, metadata={"loading": True})
|
||||||
|
revision: Optional[str] = field(default=None, metadata={"loading": True})
|
||||||
|
default_creation_method: Literal["from_config", "from_pretrained"] = "from_pretrained"
|
||||||
|
|
||||||
|
|
||||||
|
def __hash__(self):
|
||||||
|
"""Make ComponentSpec hashable, using load_id as the hash value."""
|
||||||
|
return hash((self.name, self.load_id, self.default_creation_method))
|
||||||
|
|
||||||
|
def __eq__(self, other):
|
||||||
|
"""Compare ComponentSpec objects based on name and load_id."""
|
||||||
|
if not isinstance(other, ComponentSpec):
|
||||||
|
return False
|
||||||
|
return (self.name == other.name and
|
||||||
|
self.load_id == other.load_id and
|
||||||
|
self.default_creation_method == other.default_creation_method)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_component(cls, name: str, component: Any) -> Any:
|
||||||
|
"""Create a ComponentSpec from a Component created by `create` or `load` method."""
|
||||||
|
|
||||||
|
if not hasattr(component, "_diffusers_load_id"):
|
||||||
|
raise ValueError("Component is not created by `create` or `load` method")
|
||||||
|
# throw an error if the component was created with the `create` method but is not a subclass of ConfigMixin
|
||||||
|
# YiYi TODO: remove this check if we remove support for non configmixin in `create()` method
|
||||||
|
if component._diffusers_load_id == "null" and not isinstance(component, ConfigMixin):
|
||||||
|
raise ValueError(
|
||||||
|
"We currently only support creating ComponentSpec from a component with "
|
||||||
|
"created with `ComponentSpec.load` method"
|
||||||
|
"or created with `ComponentSpec.create` and a subclass of ConfigMixin"
|
||||||
|
)
|
||||||
|
|
||||||
|
type_hint = component.__class__
|
||||||
|
default_creation_method = "from_config" if component._diffusers_load_id == "null" else "from_pretrained"
|
||||||
|
|
||||||
|
if isinstance(component, ConfigMixin):
|
||||||
|
config = component.config
|
||||||
|
else:
|
||||||
|
config = None
|
||||||
|
|
||||||
|
load_spec = cls.decode_load_id(component._diffusers_load_id)
|
||||||
|
|
||||||
|
return cls(name=name, type_hint=type_hint, config=config, default_creation_method=default_creation_method, **load_spec)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def loading_fields(cls) -> List[str]:
|
||||||
|
"""
|
||||||
|
Return the names of all loading‐related fields
|
||||||
|
(i.e. those whose field.metadata["loading"] is True).
|
||||||
|
"""
|
||||||
|
return [f.name for f in fields(cls) if f.metadata.get("loading", False)]
|
||||||
|
|
||||||
|
|
||||||
|
@property
|
||||||
|
def load_id(self) -> str:
|
||||||
|
"""
|
||||||
|
Unique identifier for this spec's pretrained load,
|
||||||
|
composed of repo|subfolder|variant|revision (no empty segments).
|
||||||
|
"""
|
||||||
|
parts = [getattr(self, k) for k in self.loading_fields()]
|
||||||
|
parts = ["null" if p is None else p for p in parts]
|
||||||
|
return "|".join(p for p in parts if p)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def decode_load_id(cls, load_id: str) -> Dict[str, Optional[str]]:
|
||||||
|
"""
|
||||||
|
Decode a load_id string back into a dictionary of loading fields and values.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
load_id: The load_id string to decode, format: "repo|subfolder|variant|revision"
|
||||||
|
where None values are represented as "null"
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict mapping loading field names to their values. e.g.
|
||||||
|
{
|
||||||
|
"repo": "path/to/repo",
|
||||||
|
"subfolder": "subfolder",
|
||||||
|
"variant": "variant",
|
||||||
|
"revision": "revision"
|
||||||
|
}
|
||||||
|
If a segment value is "null", it's replaced with None.
|
||||||
|
Returns None if load_id is "null" (indicating component not created with `load` method).
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Get all loading fields in order
|
||||||
|
loading_fields = cls.loading_fields()
|
||||||
|
result = {f: None for f in loading_fields}
|
||||||
|
|
||||||
|
if load_id == "null":
|
||||||
|
return result
|
||||||
|
|
||||||
|
# Split the load_id
|
||||||
|
parts = load_id.split("|")
|
||||||
|
|
||||||
|
# Map parts to loading fields by position
|
||||||
|
for i, part in enumerate(parts):
|
||||||
|
if i < len(loading_fields):
|
||||||
|
# Convert "null" string back to None
|
||||||
|
result[loading_fields[i]] = None if part == "null" else part
|
||||||
|
|
||||||
|
return result
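# Example (illustrative sketch) of the load_id round-trip, with placeholder values: a spec with
# repo="some/repo" and subfolder="unet" (no variant or revision) has load_id
# "some/repo|unet|null|null", and decoding maps the segments back by position.
# >>> ComponentSpec.decode_load_id("some/repo|unet|null|null")
# {'repo': 'some/repo', 'subfolder': 'unet', 'variant': None, 'revision': None}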
|
||||||
|
|
||||||
|
|
||||||
|
# YiYi TODO: I think we should only support ConfigMixin for this method (after we make guider and image_processors config mixin)
|
||||||
|
# otherwise we cannot do spec -> spec.create() -> component -> ComponentSpec.from_component(component)
|
||||||
|
# the config info is lost in the process
|
||||||
|
# remove error check in from_component spec and ModularLoader.update() if we remove support for non configmixin in `create()` method
|
||||||
|
def create(self, config: Optional[Union[FrozenDict, Dict[str, Any]]] = None, **kwargs) -> Any:
|
||||||
|
"""Create component using from_config with config."""
|
||||||
|
|
||||||
|
if self.type_hint is None or not isinstance(self.type_hint, type):
|
||||||
|
raise ValueError(
|
||||||
|
f"`type_hint` is required when using from_config creation method."
|
||||||
|
)
|
||||||
|
|
||||||
|
config = config or self.config or {}
|
||||||
|
|
||||||
|
if issubclass(self.type_hint, ConfigMixin):
|
||||||
|
component = self.type_hint.from_config(config, **kwargs)
|
||||||
|
else:
|
||||||
|
signature_params = inspect.signature(self.type_hint.__init__).parameters
|
||||||
|
init_kwargs = {}
|
||||||
|
for k, v in config.items():
|
||||||
|
if k in signature_params:
|
||||||
|
init_kwargs[k] = v
|
||||||
|
for k, v in kwargs.items():
|
||||||
|
if k in signature_params:
|
||||||
|
init_kwargs[k] = v
|
||||||
|
component = self.type_hint(**init_kwargs)
|
||||||
|
|
||||||
|
component._diffusers_load_id = "null"
|
||||||
|
if hasattr(component, "config"):
|
||||||
|
self.config = component.config
|
||||||
|
|
||||||
|
return component
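# Example (illustrative sketch): creating a lightweight component from a config instead of
# loading pretrained weights. The scheduler class and config below are assumptions.
# >>> spec = ComponentSpec(name="scheduler", type_hint=EulerDiscreteScheduler,
# ...                      config={"num_train_timesteps": 1000})
# >>> scheduler = spec.create()
# >>> scheduler._diffusers_load_id
# 'null'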
|
||||||
|
|
||||||
|
# YiYi TODO: add guard for type of model, if it is supported by from_pretrained
|
||||||
|
def load(self, **kwargs) -> Any:
|
||||||
|
"""Load component using from_pretrained."""
|
||||||
|
|
||||||
|
# select loading fields from kwargs passed from user: e.g. repo, subfolder, variant, revision, note the list could change
|
||||||
|
passed_loading_kwargs = {key: kwargs.pop(key) for key in self.loading_fields() if key in kwargs}
|
||||||
|
# merge loading field value in the spec with user passed values to create load_kwargs
|
||||||
|
load_kwargs = {key: passed_loading_kwargs.get(key, getattr(self, key)) for key in self.loading_fields()}
|
||||||
|
# repo is a required argument for from_pretrained, a.k.a. pretrained_model_name_or_path
|
||||||
|
repo = load_kwargs.pop("repo", None)
|
||||||
|
if repo is None:
|
||||||
|
raise ValueError(f"`repo` info is required when using `load` method (you can directly set it in `repo` field of the ComponentSpec or pass it as an argument)")
|
||||||
|
|
||||||
|
if self.type_hint is None:
|
||||||
|
try:
|
||||||
|
from diffusers import AutoModel
|
||||||
|
component = AutoModel.from_pretrained(repo, **load_kwargs, **kwargs)
|
||||||
|
except Exception as e:
|
||||||
|
raise ValueError(f"Unable to load {self.name} without `type_hint`: {e}")
|
||||||
|
# update type_hint if AutoModel load successfully
|
||||||
|
self.type_hint = component.__class__
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
component = self.type_hint.from_pretrained(repo, **load_kwargs, **kwargs)
|
||||||
|
except Exception as e:
|
||||||
|
raise ValueError(f"Unable to load {self.name} using load method: {e}")
|
||||||
|
|
||||||
|
self.repo = repo
|
||||||
|
for k, v in load_kwargs.items():
|
||||||
|
setattr(self, k, v)
|
||||||
|
component._diffusers_load_id = self.load_id
|
||||||
|
|
||||||
|
return component
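# Example (illustrative sketch): loading a component through its spec. The repo id, subfolder
# and model class are placeholders; any class exposing `from_pretrained` should work here.
# >>> spec = ComponentSpec(name="unet", type_hint=UNet2DConditionModel,
# ...                      repo="stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet")
# >>> unet = spec.load(torch_dtype=torch.float16)
# >>> unet._diffusers_load_id
# 'stabilityai/stable-diffusion-xl-base-1.0|unet|null|null'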
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ConfigSpec:
|
||||||
|
"""Specification for a pipeline configuration parameter."""
|
||||||
|
name: str
|
||||||
|
default: Any
|
||||||
|
description: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
# YiYi Notes: both inputs and intermediates_inputs are InputParam objects
|
||||||
|
# however some fields are not relevant for intermediates_inputs
|
||||||
|
# e.g. unlike inputs, required only used in docstring for intermediate_inputs, we do not check if a required intermediate inputs is passed
|
||||||
|
# default is not used for intermediates_inputs, we only use default from inputs, so it is ignored if it is set for intermediates_inputs
|
||||||
|
# -> should we use different class for inputs and intermediates_inputs?
|
||||||
|
@dataclass
|
||||||
|
class InputParam:
|
||||||
|
"""Specification for an input parameter."""
|
||||||
|
name: Optional[str] = None
|
||||||
|
type_hint: Any = None
|
||||||
|
default: Any = None
|
||||||
|
required: bool = False
|
||||||
|
description: str = ""
|
||||||
|
kwargs_type: Optional[str] = None # YiYi Notes: remove this feature (maybe)
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class OutputParam:
|
||||||
|
"""Specification for an output parameter."""
|
||||||
|
name: str
|
||||||
|
type_hint: Any = None
|
||||||
|
description: str = ""
|
||||||
|
kwargs_type: Optional[str] = None # YiYi notes: remove this feature (maybe)
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return f"<{self.name}: {self.type_hint.__name__ if hasattr(self.type_hint, '__name__') else str(self.type_hint)}>"
|
||||||
|
|
||||||
|
|
||||||
|
def format_inputs_short(inputs):
|
||||||
|
"""
|
||||||
|
Format input parameters into a string representation, with required params first followed by optional ones.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: List of input parameters with 'required' and 'name' attributes, and 'default' for optional params
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: Formatted string of input parameters
|
||||||
|
|
||||||
|
Example:
|
||||||
|
>>> inputs = [
|
||||||
|
... InputParam(name="prompt", required=True),
|
||||||
|
... InputParam(name="image", required=True),
|
||||||
|
... InputParam(name="guidance_scale", required=False, default=7.5),
|
||||||
|
... InputParam(name="num_inference_steps", required=False, default=50)
|
||||||
|
... ]
|
||||||
|
>>> format_inputs_short(inputs)
|
||||||
|
'prompt, image, guidance_scale=7.5, num_inference_steps=50'
|
||||||
|
"""
|
||||||
|
required_inputs = [param for param in inputs if param.required]
|
||||||
|
optional_inputs = [param for param in inputs if not param.required]
|
||||||
|
|
||||||
|
required_str = ", ".join(param.name for param in required_inputs)
|
||||||
|
optional_str = ", ".join(f"{param.name}={param.default}" for param in optional_inputs)
|
||||||
|
|
||||||
|
inputs_str = required_str
|
||||||
|
if optional_str:
|
||||||
|
inputs_str = f"{inputs_str}, {optional_str}" if required_str else optional_str
|
||||||
|
|
||||||
|
return inputs_str
|
||||||
|
|
||||||
|
|
||||||
|
def format_intermediates_short(intermediates_inputs, required_intermediates_inputs, intermediates_outputs):
|
||||||
|
"""
|
||||||
|
Formats intermediate inputs and outputs of a block into a string representation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
intermediates_inputs: List of intermediate input parameters
|
||||||
|
required_intermediates_inputs: List of required intermediate input names
|
||||||
|
intermediates_outputs: List of intermediate output parameters
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: Formatted string like:
|
||||||
|
Intermediates:
|
||||||
|
- inputs: Required(latents), dtype
|
||||||
|
- modified: latents # variables that appear in both inputs and outputs
|
||||||
|
- outputs: images # new outputs only
|
||||||
|
"""
|
||||||
|
# Handle inputs
|
||||||
|
input_parts = []
|
||||||
|
for inp in intermediates_inputs:
|
||||||
|
if inp.name in required_intermediates_inputs:
|
||||||
|
input_parts.append(f"Required({inp.name})")
|
||||||
|
else:
|
||||||
|
if inp.name is None and inp.kwargs_type is not None:
|
||||||
|
inp_name = "*_" + inp.kwargs_type
|
||||||
|
else:
|
||||||
|
inp_name = inp.name
|
||||||
|
input_parts.append(inp_name)
|
||||||
|
|
||||||
|
# Handle modified variables (appear in both inputs and outputs)
|
||||||
|
inputs_set = {inp.name for inp in intermediates_inputs}
|
||||||
|
modified_parts = []
|
||||||
|
new_output_parts = []
|
||||||
|
|
||||||
|
for out in intermediates_outputs:
|
||||||
|
if out.name in inputs_set:
|
||||||
|
modified_parts.append(out.name)
|
||||||
|
else:
|
||||||
|
new_output_parts.append(out.name)
|
||||||
|
|
||||||
|
result = []
|
||||||
|
if input_parts:
|
||||||
|
result.append(f" - inputs: {', '.join(input_parts)}")
|
||||||
|
if modified_parts:
|
||||||
|
result.append(f" - modified: {', '.join(modified_parts)}")
|
||||||
|
if new_output_parts:
|
||||||
|
result.append(f" - outputs: {', '.join(new_output_parts)}")
|
||||||
|
|
||||||
|
return "\n".join(result) if result else " (none)"
|
||||||
|
|
||||||
|
|
||||||
|
def format_params(params, header="Args", indent_level=4, max_line_length=115):
|
||||||
|
"""Format a list of InputParam or OutputParam objects into a readable string representation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
params: List of InputParam or OutputParam objects to format
|
||||||
|
header: Header text to use (e.g. "Args" or "Returns")
|
||||||
|
indent_level: Number of spaces to indent each parameter line (default: 4)
|
||||||
|
max_line_length: Maximum length for each line before wrapping (default: 115)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A formatted string representing all parameters
|
||||||
|
"""
|
||||||
|
if not params:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
base_indent = " " * indent_level
|
||||||
|
param_indent = " " * (indent_level + 4)
|
||||||
|
desc_indent = " " * (indent_level + 8)
|
||||||
|
formatted_params = []
|
||||||
|
|
||||||
|
def get_type_str(type_hint):
|
||||||
|
if hasattr(type_hint, "__origin__") and type_hint.__origin__ is Union:
|
||||||
|
types = [t.__name__ if hasattr(t, "__name__") else str(t) for t in type_hint.__args__]
|
||||||
|
return f"Union[{', '.join(types)}]"
|
||||||
|
return type_hint.__name__ if hasattr(type_hint, "__name__") else str(type_hint)
|
||||||
|
|
||||||
|
def wrap_text(text, indent, max_length):
|
||||||
|
"""Wrap text while preserving markdown links and maintaining indentation."""
|
||||||
|
words = text.split()
|
||||||
|
lines = []
|
||||||
|
current_line = []
|
||||||
|
current_length = 0
|
||||||
|
|
||||||
|
for word in words:
|
||||||
|
word_length = len(word) + (1 if current_line else 0)
|
||||||
|
|
||||||
|
if current_line and current_length + word_length > max_length:
|
||||||
|
lines.append(" ".join(current_line))
|
||||||
|
current_line = [word]
|
||||||
|
current_length = len(word)
|
||||||
|
else:
|
||||||
|
current_line.append(word)
|
||||||
|
current_length += word_length
|
||||||
|
|
||||||
|
if current_line:
|
||||||
|
lines.append(" ".join(current_line))
|
||||||
|
|
||||||
|
return f"\n{indent}".join(lines)
|
||||||
|
|
||||||
|
# Add the header
|
||||||
|
formatted_params.append(f"{base_indent}{header}:")
|
||||||
|
|
||||||
|
for param in params:
|
||||||
|
# Format parameter name and type
|
||||||
|
type_str = get_type_str(param.type_hint) if param.type_hint != Any else ""
|
||||||
|
# YiYi Notes: remove this line if we remove kwargs_type
|
||||||
|
name = f'**{param.kwargs_type}' if param.name is None and param.kwargs_type is not None else param.name
|
||||||
|
param_str = f"{param_indent}{name} (`{type_str}`"
|
||||||
|
|
||||||
|
# Add optional tag and default value if parameter is an InputParam and optional
|
||||||
|
if hasattr(param, "required"):
|
||||||
|
if not param.required:
|
||||||
|
param_str += ", *optional*"
|
||||||
|
if param.default is not None:
|
||||||
|
param_str += f", defaults to {param.default}"
|
||||||
|
param_str += "):"
|
||||||
|
|
||||||
|
# Add description on a new line with additional indentation and wrapping
|
||||||
|
if param.description:
|
||||||
|
desc = re.sub(
|
||||||
|
r'\[(.*?)\]\((https?://[^\s\)]+)\)',
|
||||||
|
r'[\1](\2)',
|
||||||
|
param.description
|
||||||
|
)
|
||||||
|
wrapped_desc = wrap_text(desc, desc_indent, max_line_length)
|
||||||
|
param_str += f"\n{desc_indent}{wrapped_desc}"
|
||||||
|
|
||||||
|
formatted_params.append(param_str)
|
||||||
|
|
||||||
|
return "\n\n".join(formatted_params)
|
||||||
|
|
||||||
|
|
||||||
|
def format_input_params(input_params, indent_level=4, max_line_length=115):
|
||||||
|
"""Format a list of InputParam objects into a readable string representation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
input_params: List of InputParam objects to format
|
||||||
|
indent_level: Number of spaces to indent each parameter line (default: 4)
|
||||||
|
max_line_length: Maximum length for each line before wrapping (default: 115)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A formatted string representing all input parameters
|
||||||
|
"""
|
||||||
|
return format_params(input_params, "Inputs", indent_level, max_line_length)
|
||||||
|
|
||||||
|
|
||||||
|
def format_output_params(output_params, indent_level=4, max_line_length=115):
|
||||||
|
"""Format a list of OutputParam objects into a readable string representation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
output_params: List of OutputParam objects to format
|
||||||
|
indent_level: Number of spaces to indent each parameter line (default: 4)
|
||||||
|
max_line_length: Maximum length for each line before wrapping (default: 115)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A formatted string representing all output parameters
|
||||||
|
"""
|
||||||
|
return format_params(output_params, "Outputs", indent_level, max_line_length)
|
||||||
|
|
||||||
|
|
||||||
|
def format_components(components, indent_level=4, max_line_length=115, add_empty_lines=True):
|
||||||
|
"""Format a list of ComponentSpec objects into a readable string representation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
components: List of ComponentSpec objects to format
|
||||||
|
indent_level: Number of spaces to indent each component line (default: 4)
|
||||||
|
max_line_length: Maximum length for each line before wrapping (default: 115)
|
||||||
|
add_empty_lines: Whether to add empty lines between components (default: True)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A formatted string representing all components
|
||||||
|
"""
|
||||||
|
if not components:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
base_indent = " " * indent_level
|
||||||
|
component_indent = " " * (indent_level + 4)
|
||||||
|
formatted_components = []
|
||||||
|
|
||||||
|
# Add the header
|
||||||
|
formatted_components.append(f"{base_indent}Components:")
|
||||||
|
if add_empty_lines:
|
||||||
|
formatted_components.append("")
|
||||||
|
|
||||||
|
# Add each component with optional empty lines between them
|
||||||
|
for i, component in enumerate(components):
|
||||||
|
# Get type name, handling special cases
|
||||||
|
type_name = component.type_hint.__name__ if hasattr(component.type_hint, "__name__") else str(component.type_hint)
|
||||||
|
|
||||||
|
component_desc = f"{component_indent}{component.name} (`{type_name}`)"
|
||||||
|
if component.description:
|
||||||
|
component_desc += f": {component.description}"
|
||||||
|
|
||||||
|
# Get the loading fields dynamically
|
||||||
|
loading_field_values = []
|
||||||
|
for field_name in component.loading_fields():
|
||||||
|
field_value = getattr(component, field_name)
|
||||||
|
if field_value is not None:
|
||||||
|
loading_field_values.append(f"{field_name}={field_value}")
|
||||||
|
|
||||||
|
# Add loading field information if available
|
||||||
|
if loading_field_values:
|
||||||
|
component_desc += f" [{', '.join(loading_field_values)}]"
|
||||||
|
|
||||||
|
formatted_components.append(component_desc)
|
||||||
|
|
||||||
|
# Add an empty line after each component except the last one
|
||||||
|
if add_empty_lines and i < len(components) - 1:
|
||||||
|
formatted_components.append("")
|
||||||
|
|
||||||
|
return "\n".join(formatted_components)
|
||||||
|
|
||||||
|
|
||||||
|
def format_configs(configs, indent_level=4, max_line_length=115, add_empty_lines=True):
|
||||||
|
"""Format a list of ConfigSpec objects into a readable string representation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
configs: List of ConfigSpec objects to format
|
||||||
|
indent_level: Number of spaces to indent each config line (default: 4)
|
||||||
|
max_line_length: Maximum length for each line before wrapping (default: 115)
|
||||||
|
add_empty_lines: Whether to add empty lines between configs (default: True)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A formatted string representing all configs
|
||||||
|
"""
|
||||||
|
if not configs:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
base_indent = " " * indent_level
|
||||||
|
config_indent = " " * (indent_level + 4)
|
||||||
|
formatted_configs = []
|
||||||
|
|
||||||
|
# Add the header
|
||||||
|
formatted_configs.append(f"{base_indent}Configs:")
|
||||||
|
if add_empty_lines:
|
||||||
|
formatted_configs.append("")
|
||||||
|
|
||||||
|
# Add each config with optional empty lines between them
|
||||||
|
for i, config in enumerate(configs):
|
||||||
|
config_desc = f"{config_indent}{config.name} (default: {config.default})"
|
||||||
|
if config.description:
|
||||||
|
config_desc += f": {config.description}"
|
||||||
|
formatted_configs.append(config_desc)
|
||||||
|
|
||||||
|
# Add an empty line after each config except the last one
|
||||||
|
if add_empty_lines and i < len(configs) - 1:
|
||||||
|
formatted_configs.append("")
|
||||||
|
|
||||||
|
return "\n".join(formatted_configs)
|
||||||
|
|
||||||
|
|
||||||
|
def make_doc_string(inputs, intermediates_inputs, outputs, description="", class_name=None, expected_components=None, expected_configs=None):
|
||||||
|
"""
|
||||||
|
Generates a formatted documentation string describing the pipeline block's parameters and structure.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs: List of input parameters
|
||||||
|
intermediates_inputs: List of intermediate input parameters
|
||||||
|
outputs: List of output parameters
|
||||||
|
description (str, *optional*): Description of the block
|
||||||
|
class_name (str, *optional*): Name of the class to include in the documentation
|
||||||
|
expected_components (List[ComponentSpec], *optional*): List of expected components
|
||||||
|
expected_configs (List[ConfigSpec], *optional*): List of expected configurations
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: A formatted string containing information about components, configs, call parameters,
|
||||||
|
intermediate inputs/outputs, and final outputs.
|
||||||
|
"""
|
||||||
|
output = ""
|
||||||
|
|
||||||
|
# Add class name if provided
|
||||||
|
if class_name:
|
||||||
|
output += f"class {class_name}\n\n"
|
||||||
|
|
||||||
|
# Add description
|
||||||
|
if description:
|
||||||
|
desc_lines = description.strip().split('\n')
|
||||||
|
aligned_desc = '\n'.join(' ' + line for line in desc_lines)
|
||||||
|
output += aligned_desc + "\n\n"
|
||||||
|
|
||||||
|
# Add components section if provided
|
||||||
|
if expected_components and len(expected_components) > 0:
|
||||||
|
components_str = format_components(expected_components, indent_level=2)
|
||||||
|
output += components_str + "\n\n"
|
||||||
|
|
||||||
|
# Add configs section if provided
|
||||||
|
if expected_configs and len(expected_configs) > 0:
|
||||||
|
configs_str = format_configs(expected_configs, indent_level=2)
|
||||||
|
output += configs_str + "\n\n"
|
||||||
|
|
||||||
|
# Add inputs section
|
||||||
|
output += format_input_params(inputs + intermediates_inputs, indent_level=2)
|
||||||
|
|
||||||
|
# Add outputs section
|
||||||
|
output += "\n\n"
|
||||||
|
output += format_output_params(outputs, indent_level=2)
|
||||||
|
|
||||||
|
return output
|
||||||
519	src/diffusers/modular_pipelines/node_utils.py	Normal file
@@ -0,0 +1,519 @@
|
|||||||
|
from ..configuration_utils import ConfigMixin
|
||||||
|
from .modular_pipeline import SequentialPipelineBlocks, ModularPipelineBlocks
|
||||||
|
from .modular_pipeline_utils import InputParam, OutputParam
|
||||||
|
from ..image_processor import PipelineImageInput
|
||||||
|
from pathlib import Path
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
from typing import Union, List, Optional, Tuple
|
||||||
|
import torch
|
||||||
|
import PIL
|
||||||
|
import numpy as np
|
||||||
|
import logging
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# YiYi Notes: this is actually for SDXL, put it here for now
|
||||||
|
SDXL_INPUTS_SCHEMA = {
|
||||||
|
"prompt": InputParam("prompt", type_hint=Union[str, List[str]], description="The prompt or prompts to guide the image generation"),
|
||||||
|
"prompt_2": InputParam("prompt_2", type_hint=Union[str, List[str]], description="The prompt or prompts to be sent to the tokenizer_2 and text_encoder_2"),
|
||||||
|
"negative_prompt": InputParam("negative_prompt", type_hint=Union[str, List[str]], description="The prompt or prompts not to guide the image generation"),
|
||||||
|
"negative_prompt_2": InputParam("negative_prompt_2", type_hint=Union[str, List[str]], description="The negative prompt or prompts for text_encoder_2"),
|
||||||
|
"cross_attention_kwargs": InputParam("cross_attention_kwargs", type_hint=Optional[dict], description="Kwargs dictionary passed to the AttentionProcessor"),
|
||||||
|
"clip_skip": InputParam("clip_skip", type_hint=Optional[int], description="Number of layers to skip in CLIP text encoder"),
|
||||||
|
"image": InputParam("image", type_hint=PipelineImageInput, required=True, description="The image(s) to modify for img2img or inpainting"),
|
||||||
|
"mask_image": InputParam("mask_image", type_hint=PipelineImageInput, required=True, description="Mask image for inpainting, white pixels will be repainted"),
|
||||||
|
"generator": InputParam("generator", type_hint=Optional[Union[torch.Generator, List[torch.Generator]]], description="Generator(s) for deterministic generation"),
|
||||||
|
"height": InputParam("height", type_hint=Optional[int], description="Height in pixels of the generated image"),
|
||||||
|
"width": InputParam("width", type_hint=Optional[int], description="Width in pixels of the generated image"),
|
||||||
|
"num_images_per_prompt": InputParam("num_images_per_prompt", type_hint=int, default=1, description="Number of images to generate per prompt"),
|
||||||
|
"num_inference_steps": InputParam("num_inference_steps", type_hint=int, default=50, description="Number of denoising steps"),
|
||||||
|
"timesteps": InputParam("timesteps", type_hint=Optional[torch.Tensor], description="Custom timesteps for the denoising process"),
|
||||||
|
"sigmas": InputParam("sigmas", type_hint=Optional[torch.Tensor], description="Custom sigmas for the denoising process"),
|
||||||
|
"denoising_end": InputParam("denoising_end", type_hint=Optional[float], description="Fraction of denoising process to complete before termination"),
|
||||||
|
# YiYi Notes: img2img defaults to 0.3, inpainting defaults to 0.9999
|
||||||
|
"strength": InputParam("strength", type_hint=float, default=0.3, description="How much to transform the reference image"),
|
||||||
|
"denoising_start": InputParam("denoising_start", type_hint=Optional[float], description="Starting point of the denoising process"),
|
||||||
|
"latents": InputParam("latents", type_hint=Optional[torch.Tensor], description="Pre-generated noisy latents for image generation"),
|
||||||
|
"padding_mask_crop": InputParam("padding_mask_crop", type_hint=Optional[Tuple[int, int]], description="Size of margin in crop for image and mask"),
|
||||||
|
"original_size": InputParam("original_size", type_hint=Optional[Tuple[int, int]], description="Original size of the image for SDXL's micro-conditioning"),
|
||||||
|
"target_size": InputParam("target_size", type_hint=Optional[Tuple[int, int]], description="Target size for SDXL's micro-conditioning"),
|
||||||
|
"negative_original_size": InputParam("negative_original_size", type_hint=Optional[Tuple[int, int]], description="Negative conditioning based on image resolution"),
|
||||||
|
"negative_target_size": InputParam("negative_target_size", type_hint=Optional[Tuple[int, int]], description="Negative conditioning based on target resolution"),
|
||||||
|
"crops_coords_top_left": InputParam("crops_coords_top_left", type_hint=Tuple[int, int], default=(0, 0), description="Top-left coordinates for SDXL's micro-conditioning"),
|
||||||
|
"negative_crops_coords_top_left": InputParam("negative_crops_coords_top_left", type_hint=Tuple[int, int], default=(0, 0), description="Negative conditioning crop coordinates"),
|
||||||
|
"aesthetic_score": InputParam("aesthetic_score", type_hint=float, default=6.0, description="Simulates aesthetic score of generated image"),
|
||||||
|
"negative_aesthetic_score": InputParam("negative_aesthetic_score", type_hint=float, default=2.0, description="Simulates negative aesthetic score"),
|
||||||
|
"eta": InputParam("eta", type_hint=float, default=0.0, description="Parameter η in the DDIM paper"),
|
||||||
|
"output_type": InputParam("output_type", type_hint=str, default="pil", description="Output format (pil/tensor/np.array)"),
|
||||||
|
"ip_adapter_image": InputParam("ip_adapter_image", type_hint=PipelineImageInput, required=True, description="Image(s) to be used as IP adapter"),
|
||||||
|
"control_image": InputParam("control_image", type_hint=PipelineImageInput, required=True, description="ControlNet input condition"),
|
||||||
|
"control_guidance_start": InputParam("control_guidance_start", type_hint=Union[float, List[float]], default=0.0, description="When ControlNet starts applying"),
|
||||||
|
"control_guidance_end": InputParam("control_guidance_end", type_hint=Union[float, List[float]], default=1.0, description="When ControlNet stops applying"),
|
||||||
|
"controlnet_conditioning_scale": InputParam("controlnet_conditioning_scale", type_hint=Union[float, List[float]], default=1.0, description="Scale factor for ControlNet outputs"),
|
||||||
|
"guess_mode": InputParam("guess_mode", type_hint=bool, default=False, description="Enables ControlNet encoder to recognize input without prompts"),
|
||||||
|
"control_mode": InputParam("control_mode", type_hint=List[int], required=True, description="Control mode for union controlnet")
|
||||||
|
}
|
||||||
|
|
||||||
|
SDXL_INTERMEDIATE_INPUTS_SCHEMA = {
|
||||||
|
"prompt_embeds": InputParam("prompt_embeds", type_hint=torch.Tensor, required=True, description="Text embeddings used to guide image generation"),
|
||||||
|
"negative_prompt_embeds": InputParam("negative_prompt_embeds", type_hint=torch.Tensor, description="Negative text embeddings"),
|
||||||
|
"pooled_prompt_embeds": InputParam("pooled_prompt_embeds", type_hint=torch.Tensor, required=True, description="Pooled text embeddings"),
|
||||||
|
"negative_pooled_prompt_embeds": InputParam("negative_pooled_prompt_embeds", type_hint=torch.Tensor, description="Negative pooled text embeddings"),
|
||||||
|
"batch_size": InputParam("batch_size", type_hint=int, required=True, description="Number of prompts"),
|
||||||
|
"dtype": InputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"),
|
||||||
|
"preprocess_kwargs": InputParam("preprocess_kwargs", type_hint=Optional[dict], description="Kwargs for ImageProcessor"),
|
||||||
|
"latents": InputParam("latents", type_hint=torch.Tensor, required=True, description="Initial latents for denoising process"),
|
||||||
|
"timesteps": InputParam("timesteps", type_hint=torch.Tensor, required=True, description="Timesteps for inference"),
|
||||||
|
"num_inference_steps": InputParam("num_inference_steps", type_hint=int, required=True, description="Number of denoising steps"),
|
||||||
|
"latent_timestep": InputParam("latent_timestep", type_hint=torch.Tensor, required=True, description="Initial noise level timestep"),
|
||||||
|
"image_latents": InputParam("image_latents", type_hint=torch.Tensor, required=True, description="Latents representing reference image"),
|
||||||
|
"mask": InputParam("mask", type_hint=torch.Tensor, required=True, description="Mask for inpainting"),
|
||||||
|
"masked_image_latents": InputParam("masked_image_latents", type_hint=torch.Tensor, description="Masked image latents for inpainting"),
|
||||||
|
"add_time_ids": InputParam("add_time_ids", type_hint=torch.Tensor, required=True, description="Time ids for conditioning"),
|
||||||
|
"negative_add_time_ids": InputParam("negative_add_time_ids", type_hint=torch.Tensor, description="Negative time ids"),
|
||||||
|
"timestep_cond": InputParam("timestep_cond", type_hint=torch.Tensor, description="Timestep conditioning for LCM"),
|
||||||
|
"noise": InputParam("noise", type_hint=torch.Tensor, description="Noise added to image latents"),
|
||||||
|
"crops_coords": InputParam("crops_coords", type_hint=Optional[Tuple[int]], description="Crop coordinates"),
|
||||||
|
"ip_adapter_embeds": InputParam("ip_adapter_embeds", type_hint=List[torch.Tensor], description="Image embeddings for IP-Adapter"),
|
||||||
|
"negative_ip_adapter_embeds": InputParam("negative_ip_adapter_embeds", type_hint=List[torch.Tensor], description="Negative image embeddings for IP-Adapter"),
|
||||||
|
"images": InputParam("images", type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]], required=True, description="Generated images")
|
||||||
|
}
|
||||||
|
|
||||||
|
SDXL_PARAM_SCHEMA = {**SDXL_INPUTS_SCHEMA, **SDXL_INTERMEDIATE_INPUTS_SCHEMA}
|
||||||
|
|
||||||
|
|
||||||
|
DEFAULT_PARAM_MAPS = {
|
||||||
|
"prompt": {
|
||||||
|
"label": "Prompt",
|
||||||
|
"type": "string",
|
||||||
|
"default": "a bear sitting in a chair drinking a milkshake",
|
||||||
|
"display": "textarea",
|
||||||
|
},
|
||||||
|
"negative_prompt": {
|
||||||
|
"label": "Negative Prompt",
|
||||||
|
"type": "string",
|
||||||
|
"default": "deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality",
|
||||||
|
"display": "textarea",
|
||||||
|
},
|
||||||
|
|
||||||
|
"num_inference_steps": {
|
||||||
|
"label": "Steps",
|
||||||
|
"type": "int",
|
||||||
|
"default": 25,
|
||||||
|
"min": 1,
|
||||||
|
"max": 1000,
|
||||||
|
},
|
||||||
|
"seed": {
|
||||||
|
"label": "Seed",
|
||||||
|
"type": "int",
|
||||||
|
"default": 0,
|
||||||
|
"min": 0,
|
||||||
|
"display": "random",
|
||||||
|
},
|
||||||
|
"width": {
|
||||||
|
"label": "Width",
|
||||||
|
"type": "int",
|
||||||
|
"display": "text",
|
||||||
|
"default": 1024,
|
||||||
|
"min": 8,
|
||||||
|
"max": 8192,
|
||||||
|
"step": 8,
|
||||||
|
"group": "dimensions",
|
||||||
|
},
|
||||||
|
"height": {
|
||||||
|
"label": "Height",
|
||||||
|
"type": "int",
|
||||||
|
"display": "text",
|
||||||
|
"default": 1024,
|
||||||
|
"min": 8,
|
||||||
|
"max": 8192,
|
||||||
|
"step": 8,
|
||||||
|
"group": "dimensions",
|
||||||
|
},
|
||||||
|
"images": {
|
||||||
|
"label": "Images",
|
||||||
|
"type": "image",
|
||||||
|
"display": "output",
|
||||||
|
},
|
||||||
|
"image": {
|
||||||
|
"label": "Image",
|
||||||
|
"type": "image",
|
||||||
|
"display": "input",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
DEFAULT_TYPE_MAPS = {
|
||||||
|
"int": {
|
||||||
|
"type": "int",
|
||||||
|
"default": 0,
|
||||||
|
"min": 0,
|
||||||
|
},
|
||||||
|
"float": {
|
||||||
|
"type": "float",
|
||||||
|
"default": 0.0,
|
||||||
|
"min": 0.0,
|
||||||
|
},
|
||||||
|
"str": {
|
||||||
|
"type": "string",
|
||||||
|
"default": "",
|
||||||
|
},
|
||||||
|
"bool": {
|
||||||
|
"type": "boolean",
|
||||||
|
"default": False,
|
||||||
|
},
|
||||||
|
"image": {
|
||||||
|
"type": "image",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
DEFAULT_MODEL_KEYS = ["unet", "vae", "text_encoder", "tokenizer", "controlnet", "transformer", "image_encoder"]
|
||||||
|
DEFAULT_CATEGORY = "Modular Diffusers"
|
||||||
|
DEFAULT_EXCLUDE_MODEL_KEYS = ["processor", "feature_extractor", "safety_checker"]
|
||||||
|
DEFAULT_PARAMS_GROUPS_KEYS = {
|
||||||
|
"text_encoders": ["text_encoder", "tokenizer"],
|
||||||
|
"ip_adapter_embeds": ["ip_adapter_embeds"],
|
||||||
|
"prompt_embeddings": ["prompt_embeds"],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_group_name(name, group_params_keys=DEFAULT_PARAMS_GROUPS_KEYS):
|
||||||
|
"""
|
||||||
|
Get the group name for a given parameter name; return None if it is not part of a group.
|
||||||
|
e.g. "prompt_embeds" -> "text_embeds", "text_encoder" -> "text_encoders", "prompt" -> None
|
||||||
|
"""
|
||||||
|
if name is None:
|
||||||
|
return None
|
||||||
|
for group_name, group_keys in group_params_keys.items():
|
||||||
|
for group_key in group_keys:
|
||||||
|
if group_key in name:
|
||||||
|
return group_name
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
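# Illustrative examples (a minimal sketch, assuming the default DEFAULT_PARAMS_GROUPS_KEYS above):
#   get_group_name("prompt_embeds")  -> "prompt_embeddings"
#   get_group_name("tokenizer_2")    -> "text_encoders"   (matches the "tokenizer" key by substring)
#   get_group_name("prompt")         -> None
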

class ModularNode(ConfigMixin):

    config_name = "node_config.json"

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        trust_remote_code: Optional[bool] = None,
        **kwargs,
    ):
        blocks = ModularPipelineBlocks.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs)
        return cls(blocks, **kwargs)

    def __init__(self, blocks, category=DEFAULT_CATEGORY, label=None, **kwargs):
        self.blocks = blocks

        if label is None:
            label = self.blocks.__class__.__name__
        # blocks param name -> mellon param name
        self.name_mapping = {}

        input_params = {}
        # pass or create a default param dict for each input
        # e.g. for prompt,
        # prompt = {
        #     "name": "text_input",  # the name of the input in the node definition, can differ from the input name in diffusers
        #     "label": "Prompt",
        #     "type": "string",
        #     "default": "a bear sitting in a chair drinking a milkshake",
        #     "display": "textarea"}
        # if type is not specified, it'll be a "custom" param of its own type
        # e.g. you can pass ModularNode(scheduler={"name": "scheduler"})
        # it will get this spec in the node definition: {"scheduler": {"label": "Scheduler", "type": "scheduler", "display": "input"}}
        # name can be a dict; in that case it is part of a "dict" input in mellon nodes, e.g. text_encoder={"name": {"text_encoders": "text_encoder"}}
        inputs = self.blocks.inputs + self.blocks.intermediates_inputs
        for inp in inputs:
            param = kwargs.pop(inp.name, None)
            if param:
                # user can pass a param dict for any input, e.g. ModularNode(prompt={...})
                input_params[inp.name] = param
                mellon_name = param.pop("name", inp.name)
                if mellon_name != inp.name:
                    self.name_mapping[inp.name] = mellon_name
                continue

            if inp.name not in DEFAULT_PARAM_MAPS and not inp.required and not get_group_name(inp.name):
                continue

            if inp.name in DEFAULT_PARAM_MAPS:
                # first check if it's in the default param map, if so, directly use that
                param = DEFAULT_PARAM_MAPS[inp.name].copy()
            elif get_group_name(inp.name):
                param = get_group_name(inp.name)
                if inp.name not in self.name_mapping:
                    self.name_mapping[inp.name] = param
            else:
                # if not, check if it's in the SDXL input schema, if so,
                # 1. use the type hint to determine the type
                # 2. use the default param dict for the type, e.g. if "steps" is an "int" type, {"steps": {"type": "int", "default": 0, "min": 0}}
                if inp.type_hint is not None:
                    type_str = str(inp.type_hint).lower()
                else:
                    inp_spec = SDXL_PARAM_SCHEMA.get(inp.name, None)
                    type_str = str(inp_spec.type_hint).lower() if inp_spec else ""
                for type_key, type_param in DEFAULT_TYPE_MAPS.items():
                    if type_key in type_str:
                        param = type_param.copy()
                        param["label"] = inp.name
                        param["display"] = "input"
                        break
                else:
                    param = inp.name
            # add the param dict to the input_params dict
            input_params[inp.name] = param

        component_params = {}
        for comp in self.blocks.expected_components:
            param = kwargs.pop(comp.name, None)
            if param:
                component_params[comp.name] = param
                mellon_name = param.pop("name", comp.name)
                if mellon_name != comp.name:
                    self.name_mapping[comp.name] = mellon_name
                continue

            to_exclude = False
            for exclude_key in DEFAULT_EXCLUDE_MODEL_KEYS:
                if exclude_key in comp.name:
                    to_exclude = True
                    break
            if to_exclude:
                continue

            if get_group_name(comp.name):
                param = get_group_name(comp.name)
                if comp.name not in self.name_mapping:
                    self.name_mapping[comp.name] = param
            elif comp.name in DEFAULT_MODEL_KEYS:
                param = {"label": comp.name, "type": "diffusers_auto_model", "display": "input"}
            else:
                param = comp.name
            # add the param dict to the component_params dict
            component_params[comp.name] = param

        output_params = {}
        if isinstance(self.blocks, SequentialPipelineBlocks):
            last_block_name = list(self.blocks.blocks.keys())[-1]
            outputs = self.blocks.blocks[last_block_name].intermediates_outputs
        else:
            outputs = self.blocks.intermediates_outputs

        for out in outputs:
            param = kwargs.pop(out.name, None)
            if param:
                output_params[out.name] = param
                mellon_name = param.pop("name", out.name)
                if mellon_name != out.name:
                    self.name_mapping[out.name] = mellon_name
                continue

            if out.name in DEFAULT_PARAM_MAPS:
                param = DEFAULT_PARAM_MAPS[out.name].copy()
                param["display"] = "output"
            else:
                group_name = get_group_name(out.name)
                if group_name:
                    param = group_name
                    if out.name not in self.name_mapping:
                        self.name_mapping[out.name] = param
                else:
                    param = out.name
            # add the param dict to the output_params dict
            output_params[out.name] = param

        if len(kwargs) > 0:
            logger.warning(f"Unused kwargs: {kwargs}")

        register_dict = {
            "category": category,
            "label": label,
            "input_params": input_params,
            "component_params": component_params,
            "output_params": output_params,
            "name_mapping": self.name_mapping,
        }
        self.register_to_config(**register_dict)

    def setup(self, components, collection=None):
        self.blocks.setup_loader(component_manager=components, collection=collection)
        self._components_manager = components

    @property
    def mellon_config(self):
        return self._convert_to_mellon_config()

    def _convert_to_mellon_config(self):
        node = {}
        node["label"] = self.config.label
        node["category"] = self.config.category

        node_param = {}
        for inp_name, inp_param in self.config.input_params.items():
            if inp_name in self.name_mapping:
                mellon_name = self.name_mapping[inp_name]
            else:
                mellon_name = inp_name
            if isinstance(inp_param, str):
                param = {
                    "label": inp_param,
                    "type": inp_param,
                    "display": "input",
                }
            else:
                param = inp_param

            if mellon_name not in node_param:
                node_param[mellon_name] = param
            else:
                logger.debug(f"Input param {mellon_name} already exists in node_param, skipping {inp_name}")

        for comp_name, comp_param in self.config.component_params.items():
            if comp_name in self.name_mapping:
                mellon_name = self.name_mapping[comp_name]
            else:
                mellon_name = comp_name
            if isinstance(comp_param, str):
                param = {
                    "label": comp_param,
                    "type": comp_param,
                    "display": "input",
                }
            else:
                param = comp_param

            if mellon_name not in node_param:
                node_param[mellon_name] = param
            else:
                logger.debug(f"Component param {comp_param} already exists in node_param, skipping {comp_name}")

        for out_name, out_param in self.config.output_params.items():
            if out_name in self.name_mapping:
                mellon_name = self.name_mapping[out_name]
            else:
                mellon_name = out_name
            if isinstance(out_param, str):
                param = {
                    "label": out_param,
                    "type": out_param,
                    "display": "output",
                }
            else:
                param = out_param

            if mellon_name not in node_param:
                node_param[mellon_name] = param
            else:
                logger.debug(f"Output param {out_param} already exists in node_param, skipping {out_name}")
        node["params"] = node_param
        return node

    def save_mellon_config(self, file_path):
        """
        Save the Mellon configuration to a JSON file.

        Args:
            file_path (str or Path): Path where the JSON file will be saved

        Returns:
            Path: Path to the saved config file
        """
        file_path = Path(file_path)

        # Create directory if it doesn't exist
        os.makedirs(file_path.parent, exist_ok=True)

        # Create a combined dictionary with the module definition and the name mapping
        config = {
            "module": self.mellon_config,
            "name_mapping": self.name_mapping,
        }

        # Save the config to file
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)

        logger.info(f"Mellon config and name mapping saved to {file_path}")

        return file_path

    @classmethod
    def load_mellon_config(cls, file_path):
        """
        Load a Mellon configuration from a JSON file.

        Args:
            file_path (str or Path): Path to the JSON file containing the Mellon config

        Returns:
            dict: The loaded combined configuration containing 'module' and 'name_mapping'
        """
        file_path = Path(file_path)

        if not file_path.exists():
            raise FileNotFoundError(f"Config file not found: {file_path}")

        with open(file_path, "r", encoding="utf-8") as f:
            config = json.load(f)

        logger.info(f"Mellon config loaded from {file_path}")

        return config

    def process_inputs(self, **kwargs):
        params_components = {}
        for comp_name, comp_param in self.config.component_params.items():
            logger.debug(f"component: {comp_name}")
            mellon_comp_name = self.name_mapping.get(comp_name, comp_name)
            if mellon_comp_name in kwargs:
                if isinstance(kwargs[mellon_comp_name], dict) and comp_name in kwargs[mellon_comp_name]:
                    comp = kwargs[mellon_comp_name].pop(comp_name)
                else:
                    comp = kwargs.pop(mellon_comp_name)
                if comp:
                    params_components[comp_name] = self._components_manager.get_one(comp["model_id"])

        params_run = {}
        for inp_name, inp_param in self.config.input_params.items():
            logger.debug(f"input: {inp_name}")
            mellon_inp_name = self.name_mapping.get(inp_name, inp_name)
            if mellon_inp_name in kwargs:
                if isinstance(kwargs[mellon_inp_name], dict) and inp_name in kwargs[mellon_inp_name]:
                    inp = kwargs[mellon_inp_name].pop(inp_name)
                else:
                    inp = kwargs.pop(mellon_inp_name)
                if inp is not None:
                    params_run[inp_name] = inp

        return_output_names = list(self.config.output_params.keys())

        return params_components, params_run, return_output_names

    def execute(self, **kwargs):
        params_components, params_run, return_output_names = self.process_inputs(**kwargs)

        self.blocks.loader.update(**params_components)
        output = self.blocks.run(**params_run, output=return_output_names)
        return output

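# A minimal usage sketch for ModularNode (illustrative only; the repo id and the
# components-manager object are placeholders, not part of this file):
#
#     node = ModularNode.from_pretrained("<org>/<modular-blocks-repo>", trust_remote_code=True)
#     node.setup(components)                  # `components` is a components manager exposing `get_one(model_id)`
#     node_definition = node.mellon_config    # Mellon node spec derived from the blocks' inputs/components/outputs
#     node.save_mellon_config("node/mellon_config.json")
#     output = node.execute(prompt="a bear sitting in a chair drinking a milkshake", num_inference_steps=25)
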
@@ -0,0 +1,53 @@
from typing import TYPE_CHECKING

from ...utils import (
    DIFFUSERS_SLOW_IMPORT,
    OptionalDependencyNotAvailable,
    _LazyModule,
    get_objects_from_module,
    is_torch_available,
    is_transformers_available,
)


_dummy_objects = {}
_import_structure = {}

try:
    if not (is_transformers_available() and is_torch_available()):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    from ...utils import dummy_torch_and_transformers_objects  # noqa F403

    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
    _import_structure["modular_pipeline_presets"] = ["StableDiffusionXLAutoPipeline"]
    _import_structure["modular_loader"] = ["StableDiffusionXLModularLoader"]
    _import_structure["encoders"] = ["StableDiffusionXLAutoIPAdapterStep", "StableDiffusionXLTextEncoderStep", "StableDiffusionXLAutoVaeEncoderStep"]
    _import_structure["decoders"] = ["StableDiffusionXLAutoDecodeStep"]
    _import_structure["modular_block_mappings"] = ["TEXT2IMAGE_BLOCKS", "IMAGE2IMAGE_BLOCKS", "INPAINT_BLOCKS", "CONTROLNET_BLOCKS", "CONTROLNET_UNION_BLOCKS", "IP_ADAPTER_BLOCKS", "AUTO_BLOCKS", "SDXL_SUPPORTED_BLOCKS"]

if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
        if not (is_transformers_available() and is_torch_available()):
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
    else:
        from .modular_pipeline_presets import StableDiffusionXLAutoPipeline
        from .modular_loader import StableDiffusionXLModularLoader
        from .encoders import StableDiffusionXLAutoIPAdapterStep, StableDiffusionXLTextEncoderStep, StableDiffusionXLAutoVaeEncoderStep
        from .decoders import StableDiffusionXLAutoDecodeStep
        from .modular_block_mappings import SDXL_SUPPORTED_BLOCKS, TEXT2IMAGE_BLOCKS, IMAGE2IMAGE_BLOCKS, INPAINT_BLOCKS, CONTROLNET_BLOCKS, CONTROLNET_UNION_BLOCKS, IP_ADAPTER_BLOCKS, AUTO_BLOCKS
else:
    import sys

    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )

    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)
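# Usage sketch (an assumption: this is the `diffusers.modular_pipelines.stable_diffusion_xl`
# package __init__, so the lazy module makes these imports resolve on first access):
#
#     from diffusers.modular_pipelines.stable_diffusion_xl import (
#         StableDiffusionXLAutoPipeline,
#         StableDiffusionXLModularLoader,
#         AUTO_BLOCKS,
#     )
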
(File diff suppressed because it is too large.)

src/diffusers/modular_pipelines/stable_diffusion_xl/decoders.py (new file, 215 lines)
@@ -0,0 +1,215 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
from typing import Any, List, Optional, Tuple, Union, Dict

import PIL
import torch
import numpy as np
from collections import OrderedDict

from ...image_processor import VaeImageProcessor, PipelineImageInput
from ...models import AutoencoderKL
from ...models.attention_processor import AttnProcessor2_0, XFormersAttnProcessor
from ...utils import logging

from ...pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
from ...configuration_utils import FrozenDict

from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam
from ..modular_pipeline import (
    AutoPipelineBlocks,
    PipelineBlock,
    PipelineState,
    SequentialPipelineBlocks,
)

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


class StableDiffusionXLDecodeStep(PipelineBlock):

    model_name = "stable-diffusion-xl"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        return [
            ComponentSpec("vae", AutoencoderKL),
            ComponentSpec(
                "image_processor",
                VaeImageProcessor,
                config=FrozenDict({"vae_scale_factor": 8}),
                default_creation_method="from_config"),
        ]

    @property
    def description(self) -> str:
        return "Step that decodes the denoised latents into images"

    @property
    def inputs(self) -> List[Tuple[str, Any]]:
        return [
            InputParam("output_type", default="pil"),
        ]

    @property
    def intermediates_inputs(self) -> List[str]:
        return [InputParam("latents", required=True, type_hint=torch.Tensor, description="The denoised latents from the denoising step")]

    @property
    def intermediates_outputs(self) -> List[str]:
        return [OutputParam("images", type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]], description="The generated images, can be a PIL.Image.Image, torch.Tensor or a numpy array")]

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae with self -> components
    @staticmethod
    def upcast_vae(components):
        dtype = components.vae.dtype
        components.vae.to(dtype=torch.float32)
        use_torch_2_0_or_xformers = isinstance(
            components.vae.decoder.mid_block.attentions[0].processor,
            (
                AttnProcessor2_0,
                XFormersAttnProcessor,
            ),
        )
        # if xformers or torch_2_0 is used attention block does not need
        # to be in float32 which can save lots of memory
        if use_torch_2_0_or_xformers:
            components.vae.post_quant_conv.to(dtype)
            components.vae.decoder.conv_in.to(dtype)
            components.vae.decoder.mid_block.to(dtype)

    @torch.no_grad()
    def __call__(self, components, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

        if not block_state.output_type == "latent":
            latents = block_state.latents
            # make sure the VAE is in float32 mode, as it overflows in float16
            block_state.needs_upcasting = components.vae.dtype == torch.float16 and components.vae.config.force_upcast

            if block_state.needs_upcasting:
                self.upcast_vae(components)
                latents = latents.to(next(iter(components.vae.post_quant_conv.parameters())).dtype)
            elif latents.dtype != components.vae.dtype:
                if torch.backends.mps.is_available():
                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
                    components.vae = components.vae.to(latents.dtype)

            # unscale/denormalize the latents
            # denormalize with the mean and std if available and not None
            block_state.has_latents_mean = (
                hasattr(components.vae.config, "latents_mean") and components.vae.config.latents_mean is not None
            )
            block_state.has_latents_std = (
                hasattr(components.vae.config, "latents_std") and components.vae.config.latents_std is not None
            )
            if block_state.has_latents_mean and block_state.has_latents_std:
                block_state.latents_mean = (
                    torch.tensor(components.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
                )
                block_state.latents_std = (
                    torch.tensor(components.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
                )
                latents = latents * block_state.latents_std / components.vae.config.scaling_factor + block_state.latents_mean
            else:
                latents = latents / components.vae.config.scaling_factor

            block_state.images = components.vae.decode(latents, return_dict=False)[0]

            # cast back to fp16 if needed
            if block_state.needs_upcasting:
                components.vae.to(dtype=torch.float16)
        else:
            block_state.images = block_state.latents

        # apply watermark if available
        if hasattr(components, "watermark") and components.watermark is not None:
            block_state.images = components.watermark.apply_watermark(block_state.images)

        block_state.images = components.image_processor.postprocess(block_state.images, output_type=block_state.output_type)

        self.add_block_state(state, block_state)

        return components, state

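# The latent unscaling above, shown in isolation (a minimal sketch with dummy values; the
# real `latents_mean`, `latents_std` and `scaling_factor` come from the VAE config):
#
#     scaling_factor = 0.13025
#     latents = torch.randn(1, 4, 128, 128)
#     latents_mean = torch.zeros(1, 4, 1, 1)
#     latents_std = torch.ones(1, 4, 1, 1)
#     # with mean/std available: undo normalization and scaling before `vae.decode`
#     denormalized = latents * latents_std / scaling_factor + latents_mean
#     # otherwise: only undo the scaling
#     unscaled = latents / scaling_factor
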

class StableDiffusionXLInpaintOverlayMaskStep(PipelineBlock):
    model_name = "stable-diffusion-xl"

    @property
    def description(self) -> str:
        return "A post-processing step that overlays the mask on the image (inpainting task only).\n" + \
            "Only needed when you are using the `padding_mask_crop` option when pre-processing the image and mask"

    @property
    def inputs(self) -> List[Tuple[str, Any]]:
        return [
            InputParam("image", required=True),
            InputParam("mask_image", required=True),
            InputParam("padding_mask_crop"),
        ]

    @property
    def intermediates_inputs(self) -> List[str]:
        return [
            InputParam("images", required=True, type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]], description="The generated images from the decode step"),
            InputParam("crops_coords", required=True, type_hint=Tuple[int, int], description="The crop coordinates to use for preprocess/postprocess the image and mask, for inpainting task only. Can be generated in vae_encode step.")
        ]

    @property
    def intermediates_outputs(self) -> List[str]:
        return [OutputParam("images", type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]], description="The generated images with the mask overlaid")]

    @torch.no_grad()
    def __call__(self, components, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

        if block_state.padding_mask_crop is not None and block_state.crops_coords is not None:
            block_state.images = [components.image_processor.apply_overlay(block_state.mask_image, block_state.image, i, block_state.crops_coords) for i in block_state.images]

        self.add_block_state(state, block_state)

        return components, state


class StableDiffusionXLInpaintDecodeStep(SequentialPipelineBlocks):
    block_classes = [StableDiffusionXLDecodeStep, StableDiffusionXLInpaintOverlayMaskStep]
    block_names = ["decode", "mask_overlay"]

    @property
    def description(self):
        return "Inpaint decode step that decodes the denoised latents into image outputs.\n" + \
            "This is a sequential pipeline block:\n" + \
            " - `StableDiffusionXLDecodeStep` is used to decode the denoised latents into images\n" + \
            " - `StableDiffusionXLInpaintOverlayMaskStep` is used to overlay the mask on the image"


class StableDiffusionXLAutoDecodeStep(AutoPipelineBlocks):
    block_classes = [StableDiffusionXLInpaintDecodeStep, StableDiffusionXLDecodeStep]
    block_names = ["inpaint", "non-inpaint"]
    block_trigger_inputs = ["padding_mask_crop", None]

    @property
    def description(self):
        return "Decode step that decodes the denoised latents into image outputs.\n" + \
            "This is an auto pipeline block that works for inpainting and non-inpainting tasks.\n" + \
            " - `StableDiffusionXLInpaintDecodeStep` (inpaint) is used when `padding_mask_crop` is provided.\n" + \
            " - `StableDiffusionXLDecodeStep` (non-inpaint) is used when `padding_mask_crop` is not provided."

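# How the auto decode step resolves at runtime (illustrative; the trigger dispatch itself
# lives in AutoPipelineBlocks and is keyed on `block_trigger_inputs`):
#
#     decode_step = StableDiffusionXLAutoDecodeStep()
#     # state provides `padding_mask_crop`      -> "inpaint" branch (StableDiffusionXLInpaintDecodeStep)
#     # `padding_mask_crop` is absent or None   -> "non-inpaint" branch (StableDiffusionXLDecodeStep)
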
src/diffusers/modular_pipelines/stable_diffusion_xl/denoise.py (new file, 1400 lines; diff suppressed because it is too large)

src/diffusers/modular_pipelines/stable_diffusion_xl/encoders.py (new file, 858 lines)
@@ -0,0 +1,858 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
from typing import Any, List, Optional, Tuple, Union, Dict

import PIL
import torch
from collections import OrderedDict

from ...image_processor import VaeImageProcessor, PipelineImageInput
from ...loaders import StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin, ModularIPAdapterMixin
from ...models import ControlNetModel, ImageProjection, UNet2DConditionModel, AutoencoderKL, ControlNetUnionModel
from ...models.attention_processor import AttnProcessor2_0, XFormersAttnProcessor
from ...models.lora import adjust_lora_scale_text_encoder
from ...utils import (
    USE_PEFT_BACKEND,
    logging,
    scale_lora_layers,
    unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor, unwrap_module
from ...pipelines.controlnet.multicontrolnet import MultiControlNetModel
from ...configuration_utils import FrozenDict

from transformers import (
    CLIPTextModel,
    CLIPImageProcessor,
    CLIPTextModelWithProjection,
    CLIPTokenizer,
    CLIPVisionModelWithProjection,
)

from ...schedulers import EulerDiscreteScheduler
from ...guiders import ClassifierFreeGuidance

from .modular_loader import StableDiffusionXLModularLoader
from ..modular_pipeline import PipelineBlock, PipelineState, AutoPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam, ConfigSpec

import numpy as np

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    elif hasattr(encoder_output, "latents"):
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")

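# Usage sketch for `retrieve_latents` (illustrative; `vae` is any AutoencoderKL instance):
#
#     posterior = vae.encode(image_tensor)                       # returns an object holding a `latent_dist`
#     latents = retrieve_latents(posterior, generator=generator) # samples from the latent distribution
#     mode = retrieve_latents(posterior, sample_mode="argmax")   # or takes the distribution mode instead
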

class StableDiffusionXLIPAdapterStep(PipelineBlock):
    model_name = "stable-diffusion-xl"

    @property
    def description(self) -> str:
        return (
            "IP Adapter step that handles all the ip adapter related tasks: Load/unload ip adapter weights into unet, prepare ip adapter image embeddings, etc"
            " See [ModularIPAdapterMixin](https://huggingface.co/docs/diffusers/api/loaders/ip_adapter#diffusers.loaders.ModularIPAdapterMixin)"
            " for more details"
        )

    @property
    def expected_components(self) -> List[ComponentSpec]:
        return [
            ComponentSpec("image_encoder", CLIPVisionModelWithProjection),
            ComponentSpec("feature_extractor", CLIPImageProcessor, config=FrozenDict({"size": 224, "crop_size": 224}), default_creation_method="from_config"),
            ComponentSpec("unet", UNet2DConditionModel),
            ComponentSpec(
                "guider",
                ClassifierFreeGuidance,
                config=FrozenDict({"guidance_scale": 7.5}),
                default_creation_method="from_config"),
        ]

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(
                "ip_adapter_image",
                PipelineImageInput,
                required=True,
                description="The image(s) to be used as ip adapter"
            )
        ]

    @property
    def intermediates_outputs(self) -> List[OutputParam]:
        return [
            OutputParam("ip_adapter_embeds", type_hint=torch.Tensor, description="IP adapter image embeddings"),
            OutputParam("negative_ip_adapter_embeds", type_hint=torch.Tensor, description="Negative IP adapter image embeddings")
        ]

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image with self -> components
    @staticmethod
    def encode_image(components, image, device, num_images_per_prompt, output_hidden_states=None):
        dtype = next(components.image_encoder.parameters()).dtype

        if not isinstance(image, torch.Tensor):
            image = components.feature_extractor(image, return_tensors="pt").pixel_values

        image = image.to(device=device, dtype=dtype)
        if output_hidden_states:
            image_enc_hidden_states = components.image_encoder(image, output_hidden_states=True).hidden_states[-2]
            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
            uncond_image_enc_hidden_states = components.image_encoder(
                torch.zeros_like(image), output_hidden_states=True
            ).hidden_states[-2]
            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
                num_images_per_prompt, dim=0
            )
            return image_enc_hidden_states, uncond_image_enc_hidden_states
        else:
            image_embeds = components.image_encoder(image).image_embeds
            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
            uncond_image_embeds = torch.zeros_like(image_embeds)

            return image_embeds, uncond_image_embeds

    # modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
    def prepare_ip_adapter_image_embeds(
        self, components, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, prepare_unconditional_embeds
    ):
        image_embeds = []
        if prepare_unconditional_embeds:
            negative_image_embeds = []
        if ip_adapter_image_embeds is None:
            if not isinstance(ip_adapter_image, list):
                ip_adapter_image = [ip_adapter_image]

            if len(ip_adapter_image) != len(components.unet.encoder_hid_proj.image_projection_layers):
                raise ValueError(
                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(components.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                )

            for single_ip_adapter_image, image_proj_layer in zip(
                ip_adapter_image, components.unet.encoder_hid_proj.image_projection_layers
            ):
                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
                single_image_embeds, single_negative_image_embeds = self.encode_image(
                    components, single_ip_adapter_image, device, 1, output_hidden_state
                )

                image_embeds.append(single_image_embeds[None, :])
                if prepare_unconditional_embeds:
                    negative_image_embeds.append(single_negative_image_embeds[None, :])
        else:
            for single_image_embeds in ip_adapter_image_embeds:
                if prepare_unconditional_embeds:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
                    negative_image_embeds.append(single_negative_image_embeds)
                image_embeds.append(single_image_embeds)

        ip_adapter_image_embeds = []
        for i, single_image_embeds in enumerate(image_embeds):
            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
            if prepare_unconditional_embeds:
                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)

            single_image_embeds = single_image_embeds.to(device=device)
            ip_adapter_image_embeds.append(single_image_embeds)

        return ip_adapter_image_embeds

    @torch.no_grad()
    def __call__(self, components: StableDiffusionXLModularLoader, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

        block_state.prepare_unconditional_embeds = components.guider.num_conditions > 1
        block_state.device = components._execution_device

        block_state.ip_adapter_embeds = self.prepare_ip_adapter_image_embeds(
            components,
            ip_adapter_image=block_state.ip_adapter_image,
            ip_adapter_image_embeds=None,
            device=block_state.device,
            num_images_per_prompt=1,
            prepare_unconditional_embeds=block_state.prepare_unconditional_embeds,
        )
        if block_state.prepare_unconditional_embeds:
            block_state.negative_ip_adapter_embeds = []
            for i, image_embeds in enumerate(block_state.ip_adapter_embeds):
                negative_image_embeds, image_embeds = image_embeds.chunk(2)
                block_state.negative_ip_adapter_embeds.append(negative_image_embeds)
                block_state.ip_adapter_embeds[i] = image_embeds

        self.add_block_state(state, block_state)
        return components, state

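# Shape bookkeeping in the step above (illustrative): when classifier-free guidance is
# active, the prepared embeds are `cat([negative, positive], dim=0)`, and `__call__`
# splits them back into the two output fields:
#
#     combined = torch.cat([negative_embeds, positive_embeds], dim=0)   # shape (2 * n, ...)
#     negative_embeds, positive_embeds = combined.chunk(2)              # each shape (n, ...)
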

class StableDiffusionXLTextEncoderStep(PipelineBlock):

    model_name = "stable-diffusion-xl"

    @property
    def description(self) -> str:
        return (
            "Text Encoder step that generates text embeddings to guide the image generation"
        )

    @property
    def expected_components(self) -> List[ComponentSpec]:
        return [
            ComponentSpec("text_encoder", CLIPTextModel),
            ComponentSpec("text_encoder_2", CLIPTextModelWithProjection),
            ComponentSpec("tokenizer", CLIPTokenizer),
            ComponentSpec("tokenizer_2", CLIPTokenizer),
            ComponentSpec(
                "guider",
                ClassifierFreeGuidance,
                config=FrozenDict({"guidance_scale": 7.5}),
                default_creation_method="from_config"),
        ]

    @property
    def expected_configs(self) -> List[ConfigSpec]:
        return [ConfigSpec("force_zeros_for_empty_prompt", True)]

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam("prompt"),
            InputParam("prompt_2"),
            InputParam("negative_prompt"),
            InputParam("negative_prompt_2"),
            InputParam("cross_attention_kwargs"),
            InputParam("clip_skip"),
        ]

    @property
    def intermediates_outputs(self) -> List[OutputParam]:
        return [
            OutputParam("prompt_embeds", type_hint=torch.Tensor, kwargs_type="guider_input_fields", description="text embeddings used to guide the image generation"),
            OutputParam("negative_prompt_embeds", type_hint=torch.Tensor, kwargs_type="guider_input_fields", description="negative text embeddings used to guide the image generation"),
            OutputParam("pooled_prompt_embeds", type_hint=torch.Tensor, kwargs_type="guider_input_fields", description="pooled text embeddings used to guide the image generation"),
            OutputParam("negative_pooled_prompt_embeds", type_hint=torch.Tensor, kwargs_type="guider_input_fields", description="negative pooled text embeddings used to guide the image generation"),
        ]

    @staticmethod
    def check_inputs(block_state):
        if block_state.prompt is not None and (not isinstance(block_state.prompt, str) and not isinstance(block_state.prompt, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(block_state.prompt)}")
        elif block_state.prompt_2 is not None and (not isinstance(block_state.prompt_2, str) and not isinstance(block_state.prompt_2, list)):
            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(block_state.prompt_2)}")

    @staticmethod
    def encode_prompt(
        components,
        prompt: str,
        prompt_2: Optional[str] = None,
        device: Optional[torch.device] = None,
        num_images_per_prompt: int = 1,
        prepare_unconditional_embeds: bool = True,
        negative_prompt: Optional[str] = None,
        negative_prompt_2: Optional[str] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        pooled_prompt_embeds: Optional[torch.Tensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
        lora_scale: Optional[float] = None,
        clip_skip: Optional[int] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                used in both text-encoders
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            prepare_unconditional_embeds (`bool`):
                whether to prepare unconditional embeddings or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            negative_prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                input argument.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
        """
        device = device or components._execution_device

        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
        if lora_scale is not None and isinstance(components, StableDiffusionXLLoraLoaderMixin):
            components._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
            if components.text_encoder is not None:
                if not USE_PEFT_BACKEND:
                    adjust_lora_scale_text_encoder(components.text_encoder, lora_scale)
                else:
                    scale_lora_layers(components.text_encoder, lora_scale)

            if components.text_encoder_2 is not None:
                if not USE_PEFT_BACKEND:
                    adjust_lora_scale_text_encoder(components.text_encoder_2, lora_scale)
                else:
                    scale_lora_layers(components.text_encoder_2, lora_scale)

        prompt = [prompt] if isinstance(prompt, str) else prompt

        if prompt is not None:
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        # Define tokenizers and text encoders
        tokenizers = [components.tokenizer, components.tokenizer_2] if components.tokenizer is not None else [components.tokenizer_2]
        text_encoders = (
            [components.text_encoder, components.text_encoder_2] if components.text_encoder is not None else [components.text_encoder_2]
        )

        if prompt_embeds is None:
            prompt_2 = prompt_2 or prompt
            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2

            # textual inversion: process multi-vector tokens if necessary
            prompt_embeds_list = []
            prompts = [prompt, prompt_2]
            for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
                if isinstance(components, TextualInversionLoaderMixin):
                    prompt = components.maybe_convert_prompt(prompt, tokenizer)

                text_inputs = tokenizer(
                    prompt,
                    padding="max_length",
                    max_length=tokenizer.model_max_length,
                    truncation=True,
                    return_tensors="pt",
                )

                text_input_ids = text_inputs.input_ids
                untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

                if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                    text_input_ids, untruncated_ids
                ):
                    removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
                    logger.warning(
                        "The following part of your input was truncated because CLIP can only handle sequences up to"
                        f" {tokenizer.model_max_length} tokens: {removed_text}"
                    )

                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                # We are only ALWAYS interested in the pooled output of the final text encoder
                pooled_prompt_embeds = prompt_embeds[0]
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
                    # "2" because SDXL always indexes from the penultimate layer.
                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]

                prompt_embeds_list.append(prompt_embeds)

            prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)

        # get unconditional embeddings for classifier free guidance
        zero_out_negative_prompt = negative_prompt is None and components.config.force_zeros_for_empty_prompt
        if prepare_unconditional_embeds and negative_prompt_embeds is None and zero_out_negative_prompt:
            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
        elif prepare_unconditional_embeds and negative_prompt_embeds is None:
            negative_prompt = negative_prompt or ""
            negative_prompt_2 = negative_prompt_2 or negative_prompt

            # normalize str to list
            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
            negative_prompt_2 = (
                batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
            )

            uncond_tokens: List[str]
            if prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = [negative_prompt, negative_prompt_2]

            negative_prompt_embeds_list = []
            for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
                if isinstance(components, TextualInversionLoaderMixin):
                    negative_prompt = components.maybe_convert_prompt(negative_prompt, tokenizer)

                max_length = prompt_embeds.shape[1]
                uncond_input = tokenizer(
                    negative_prompt,
                    padding="max_length",
                    max_length=max_length,
                    truncation=True,
                    return_tensors="pt",
                )

                negative_prompt_embeds = text_encoder(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]

                negative_prompt_embeds_list.append(negative_prompt_embeds)

            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)

        if components.text_encoder_2 is not None:
            prompt_embeds = prompt_embeds.to(dtype=components.text_encoder_2.dtype, device=device)
        else:
            prompt_embeds = prompt_embeds.to(dtype=components.unet.dtype, device=device)

        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

        if prepare_unconditional_embeds:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = negative_prompt_embeds.shape[1]

            if components.text_encoder_2 is not None:
                negative_prompt_embeds = negative_prompt_embeds.to(dtype=components.text_encoder_2.dtype, device=device)
            else:
                negative_prompt_embeds = negative_prompt_embeds.to(dtype=components.unet.dtype, device=device)

            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
            bs_embed * num_images_per_prompt, -1
        )
        if prepare_unconditional_embeds:
            negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
                bs_embed * num_images_per_prompt, -1
            )

        if components.text_encoder is not None:
            if isinstance(components, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
                # Retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(components.text_encoder, lora_scale)

        if components.text_encoder_2 is not None:
            if isinstance(components, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
                # Retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(components.text_encoder_2, lora_scale)

        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds

    @torch.no_grad()
    def __call__(self, components: StableDiffusionXLModularLoader, state: PipelineState) -> PipelineState:
        # Get inputs and intermediates
        block_state = self.get_block_state(state)
        self.check_inputs(block_state)

        block_state.prepare_unconditional_embeds = components.guider.num_conditions > 1
        block_state.device = components._execution_device

        # Encode input prompt
        block_state.text_encoder_lora_scale = (
            block_state.cross_attention_kwargs.get("scale", None) if block_state.cross_attention_kwargs is not None else None
        )
        (
            block_state.prompt_embeds,
            block_state.negative_prompt_embeds,
            block_state.pooled_prompt_embeds,
            block_state.negative_pooled_prompt_embeds,
        ) = self.encode_prompt(
            components,
            block_state.prompt,
            block_state.prompt_2,
            block_state.device,
            1,
            block_state.prepare_unconditional_embeds,
            block_state.negative_prompt,
            block_state.negative_prompt_2,
            prompt_embeds=None,
            negative_prompt_embeds=None,
            pooled_prompt_embeds=None,
            negative_pooled_prompt_embeds=None,
            lora_scale=block_state.text_encoder_lora_scale,
            clip_skip=block_state.clip_skip,
        )
        # Add outputs
        self.add_block_state(state, block_state)
        return components, state

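# Result shapes of `encode_prompt` (illustrative, for a single prompt, the standard SDXL
# text encoders and num_images_per_prompt=1): `prompt_embeds` is (1, 77, 2048) because the
# two encoders' penultimate hidden states are concatenated on the last dim, and
# `pooled_prompt_embeds` is (1, 1280) from the pooled output of `text_encoder_2`.
#
#     prompt_embeds, neg_embeds, pooled, neg_pooled = StableDiffusionXLTextEncoderStep.encode_prompt(
#         components, prompt="a bear sitting in a chair drinking a milkshake"
#     )
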
class StableDiffusionXLVaeEncoderStep(PipelineBlock):
|
||||||
|
|
||||||
|
model_name = "stable-diffusion-xl"
|
||||||
|
|
||||||
|
|
||||||
|
@property
|
||||||
|
def description(self) -> str:
|
||||||
|
return (
|
||||||
|
"Vae Encoder step that encode the input image into a latent representation"
|
||||||
|
)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def expected_components(self) -> List[ComponentSpec]:
|
||||||
|
return [
|
||||||
|
ComponentSpec("vae", AutoencoderKL),
|
||||||
|
ComponentSpec(
|
||||||
|
"image_processor",
|
||||||
|
VaeImageProcessor,
|
||||||
|
config=FrozenDict({"vae_scale_factor": 8}),
|
||||||
|
default_creation_method="from_config"),
|
||||||
|
]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def inputs(self) -> List[InputParam]:
|
||||||
|
return [
|
||||||
|
InputParam("image", required=True),
|
||||||
|
InputParam("height"),
|
||||||
|
InputParam("width"),
|
||||||
|
]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def intermediates_inputs(self) -> List[InputParam]:
|
||||||
|
return [
|
||||||
|
InputParam("generator"),
|
||||||
|
InputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"),
|
||||||
|
InputParam("preprocess_kwargs", type_hint=Optional[dict], description="A kwargs dictionary that if specified is passed along to the `ImageProcessor` as defined under `self.image_processor` in [diffusers.image_processor.VaeImageProcessor]")]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def intermediates_outputs(self) -> List[OutputParam]:
|
||||||
|
return [OutputParam("image_latents", type_hint=torch.Tensor, description="The latents representing the reference image for image-to-image/inpainting generation")]
|
||||||
|
|
||||||
|
    # Modified from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_inpaint.StableDiffusionXLInpaintPipeline._encode_vae_image with self -> components
    # YiYi TODO: update the _encode_vae_image so that we can use # Copied from
    def _encode_vae_image(self, components, image: torch.Tensor, generator: torch.Generator):
        latents_mean = latents_std = None
        if hasattr(components.vae.config, "latents_mean") and components.vae.config.latents_mean is not None:
            latents_mean = torch.tensor(components.vae.config.latents_mean).view(1, 4, 1, 1)
        if hasattr(components.vae.config, "latents_std") and components.vae.config.latents_std is not None:
            latents_std = torch.tensor(components.vae.config.latents_std).view(1, 4, 1, 1)

        dtype = image.dtype
        if components.vae.config.force_upcast:
            image = image.float()
            components.vae.to(dtype=torch.float32)

        if isinstance(generator, list):
            image_latents = [
                retrieve_latents(components.vae.encode(image[i : i + 1]), generator=generator[i])
                for i in range(image.shape[0])
            ]
            image_latents = torch.cat(image_latents, dim=0)
        else:
            image_latents = retrieve_latents(components.vae.encode(image), generator=generator)

        if components.vae.config.force_upcast:
            components.vae.to(dtype)

        image_latents = image_latents.to(dtype)
        if latents_mean is not None and latents_std is not None:
            latents_mean = latents_mean.to(device=image_latents.device, dtype=dtype)
            latents_std = latents_std.to(device=image_latents.device, dtype=dtype)
            image_latents = (image_latents - latents_mean) * components.vae.config.scaling_factor / latents_std
        else:
            image_latents = components.vae.config.scaling_factor * image_latents

        return image_latents

    @torch.no_grad()
    def __call__(self, components: StableDiffusionXLModularLoader, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)
        block_state.preprocess_kwargs = block_state.preprocess_kwargs or {}
        block_state.device = components._execution_device
        block_state.dtype = block_state.dtype if block_state.dtype is not None else components.vae.dtype

        block_state.image = components.image_processor.preprocess(block_state.image, height=block_state.height, width=block_state.width, **block_state.preprocess_kwargs)
        block_state.image = block_state.image.to(device=block_state.device, dtype=block_state.dtype)

        block_state.batch_size = block_state.image.shape[0]

        # if generator is a list, make sure the length of it matches the length of images (both should be batch_size)
        if isinstance(block_state.generator, list) and len(block_state.generator) != block_state.batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(block_state.generator)}, but requested an effective batch"
                f" size of {block_state.batch_size}. Make sure the batch size matches the length of the generators."
            )

        block_state.image_latents = self._encode_vae_image(components, image=block_state.image, generator=block_state.generator)

        self.add_block_state(state, block_state)

        return components, state
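

# A minimal, standalone sketch of the latent scaling performed by `_encode_vae_image` above,
# assuming a VAE config that carries `scaling_factor` and optional `latents_mean`/`latents_std`
# calibration statistics (as SDXL checkpoints do). It only restates the math; actual encoding
# should go through the block's `__call__` so the pipeline state stays consistent.
def scale_image_latents(latents, scaling_factor, latents_mean=None, latents_std=None):
    # with calibration statistics: shift and rescale per channel, then apply the scaling factor
    if latents_mean is not None and latents_std is not None:
        return (latents - latents_mean) * scaling_factor / latents_std
    # default SDXL behaviour: multiply by the scaling factor only
    return scaling_factor * latents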


class StableDiffusionXLInpaintVaeEncoderStep(PipelineBlock):
    model_name = "stable-diffusion-xl"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        return [
            ComponentSpec("vae", AutoencoderKL),
            ComponentSpec(
                "image_processor",
                VaeImageProcessor,
                config=FrozenDict({"vae_scale_factor": 8}),
                default_creation_method="from_config"),
            ComponentSpec(
                "mask_processor",
                VaeImageProcessor,
                config=FrozenDict({"do_normalize": False, "vae_scale_factor": 8, "do_binarize": True, "do_convert_grayscale": True}),
                default_creation_method="from_config"),
        ]

    @property
    def description(self) -> str:
        return (
            "VAE encoder step that prepares the image and mask for the inpainting process"
        )

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam("height"),
            InputParam("width"),
            InputParam("image", required=True),
            InputParam("mask_image", required=True),
            InputParam("padding_mask_crop"),
        ]

    @property
    def intermediates_inputs(self) -> List[InputParam]:
        return [
            InputParam("dtype", type_hint=torch.dtype, description="The dtype of the model inputs"),
            InputParam("generator"),
        ]

    @property
    def intermediates_outputs(self) -> List[OutputParam]:
        return [OutputParam("image_latents", type_hint=torch.Tensor, description="The latents representation of the input image"),
                OutputParam("mask", type_hint=torch.Tensor, description="The mask to use for the inpainting process"),
                OutputParam("masked_image_latents", type_hint=torch.Tensor, description="The masked image latents to use for the inpainting process (only for inpainting-specific UNet)"),
                OutputParam("crops_coords", type_hint=Optional[Tuple[int, int]], description="The crop coordinates to use for the preprocess/postprocess of the image and mask")]

    # Modified from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_inpaint.StableDiffusionXLInpaintPipeline._encode_vae_image with self -> components
    # YiYi TODO: update the _encode_vae_image so that we can use # Copied from
    def _encode_vae_image(self, components, image: torch.Tensor, generator: torch.Generator):
        latents_mean = latents_std = None
        if hasattr(components.vae.config, "latents_mean") and components.vae.config.latents_mean is not None:
            latents_mean = torch.tensor(components.vae.config.latents_mean).view(1, 4, 1, 1)
        if hasattr(components.vae.config, "latents_std") and components.vae.config.latents_std is not None:
            latents_std = torch.tensor(components.vae.config.latents_std).view(1, 4, 1, 1)

        dtype = image.dtype
        if components.vae.config.force_upcast:
            image = image.float()
            components.vae.to(dtype=torch.float32)

        if isinstance(generator, list):
            image_latents = [
                retrieve_latents(components.vae.encode(image[i : i + 1]), generator=generator[i])
                for i in range(image.shape[0])
            ]
            image_latents = torch.cat(image_latents, dim=0)
        else:
            image_latents = retrieve_latents(components.vae.encode(image), generator=generator)

        if components.vae.config.force_upcast:
            components.vae.to(dtype)

        image_latents = image_latents.to(dtype)
        if latents_mean is not None and latents_std is not None:
            latents_mean = latents_mean.to(device=image_latents.device, dtype=dtype)
            latents_std = latents_std.to(device=image_latents.device, dtype=dtype)
            image_latents = (image_latents - latents_mean) * components.vae.config.scaling_factor / latents_std
        else:
            image_latents = components.vae.config.scaling_factor * image_latents

        return image_latents

    # modified from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_inpaint.StableDiffusionXLInpaintPipeline.prepare_mask_latents
    # do not accept do_classifier_free_guidance
    def prepare_mask_latents(
        self, components, mask, masked_image, batch_size, height, width, dtype, device, generator
    ):
        # resize the mask to latents shape as we concatenate the mask to the latents
        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
        # and half precision
        mask = torch.nn.functional.interpolate(
            mask, size=(height // components.vae_scale_factor, width // components.vae_scale_factor)
        )
        mask = mask.to(device=device, dtype=dtype)

        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
        if mask.shape[0] < batch_size:
            if not batch_size % mask.shape[0] == 0:
                raise ValueError(
                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
                    " of masks that you pass is divisible by the total requested batch size."
                )
            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)

        if masked_image is not None and masked_image.shape[1] == 4:
            masked_image_latents = masked_image
        else:
            masked_image_latents = None

        if masked_image is not None:
            if masked_image_latents is None:
                masked_image = masked_image.to(device=device, dtype=dtype)
                masked_image_latents = self._encode_vae_image(components, masked_image, generator=generator)

            if masked_image_latents.shape[0] < batch_size:
                if not batch_size % masked_image_latents.shape[0] == 0:
                    raise ValueError(
                        "The passed images and the required batch size don't match. Images are supposed to be duplicated"
                        f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
                        " Make sure the number of images that you pass is divisible by the total requested batch size."
                    )
                masked_image_latents = masked_image_latents.repeat(
                    batch_size // masked_image_latents.shape[0], 1, 1, 1
                )

            # aligning device to prevent device errors when concatenating it with the latent model input
            masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)

        return mask, masked_image_latents

    @torch.no_grad()
    def __call__(self, components: StableDiffusionXLModularLoader, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

        block_state.dtype = block_state.dtype if block_state.dtype is not None else components.vae.dtype
        block_state.device = components._execution_device

        if block_state.padding_mask_crop is not None:
            block_state.crops_coords = components.mask_processor.get_crop_region(block_state.mask_image, block_state.width, block_state.height, pad=block_state.padding_mask_crop)
            block_state.resize_mode = "fill"
        else:
            block_state.crops_coords = None
            block_state.resize_mode = "default"

        block_state.image = components.image_processor.preprocess(block_state.image, height=block_state.height, width=block_state.width, crops_coords=block_state.crops_coords, resize_mode=block_state.resize_mode)
        block_state.image = block_state.image.to(dtype=torch.float32)

        block_state.mask = components.mask_processor.preprocess(block_state.mask_image, height=block_state.height, width=block_state.width, resize_mode=block_state.resize_mode, crops_coords=block_state.crops_coords)
        block_state.masked_image = block_state.image * (block_state.mask < 0.5)

        block_state.batch_size = block_state.image.shape[0]
        block_state.image = block_state.image.to(device=block_state.device, dtype=block_state.dtype)
        block_state.image_latents = self._encode_vae_image(components, image=block_state.image, generator=block_state.generator)

        # 7. Prepare mask latent variables
        block_state.mask, block_state.masked_image_latents = self.prepare_mask_latents(
            components,
            block_state.mask,
            block_state.masked_image,
            block_state.batch_size,
            block_state.height,
            block_state.width,
            block_state.dtype,
            block_state.device,
            block_state.generator,
        )

        self.add_block_state(state, block_state)

        return components, state


# auto blocks (YiYi TODO: maybe move all the auto blocks to a separate file)
# Encode
class StableDiffusionXLAutoVaeEncoderStep(AutoPipelineBlocks):
    block_classes = [StableDiffusionXLInpaintVaeEncoderStep, StableDiffusionXLVaeEncoderStep]
    block_names = ["inpaint", "img2img"]
    block_trigger_inputs = ["mask_image", "image"]

    @property
    def description(self):
        return "VAE encoder step that encodes the image inputs into their latent representations.\n" + \
               "This is an auto pipeline block that works for both inpainting and img2img tasks.\n" + \
               " - `StableDiffusionXLInpaintVaeEncoderStep` (inpaint) is used when both `mask_image` and `image` are provided.\n" + \
               " - `StableDiffusionXLVaeEncoderStep` (img2img) is used when only `image` is provided."


class StableDiffusionXLAutoIPAdapterStep(AutoPipelineBlocks, ModularIPAdapterMixin):
    block_classes = [StableDiffusionXLIPAdapterStep]
    block_names = ["ip_adapter"]
    block_trigger_inputs = ["ip_adapter_image"]

    @property
    def description(self):
        return "Run IP Adapter step if `ip_adapter_image` is provided."

@@ -0,0 +1,121 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ..modular_pipeline_utils import InsertableOrderedDict

# Import all the necessary block classes
from .denoise import (
    StableDiffusionXLAutoDenoiseStep,
    StableDiffusionXLControlNetDenoiseStep,
    StableDiffusionXLDenoiseLoop,
    StableDiffusionXLInpaintDenoiseLoop
)
from .before_denoise import (
    StableDiffusionXLAutoBeforeDenoiseStep,
    StableDiffusionXLInputStep,
    StableDiffusionXLSetTimestepsStep,
    StableDiffusionXLPrepareLatentsStep,
    StableDiffusionXLPrepareAdditionalConditioningStep,
    StableDiffusionXLImg2ImgSetTimestepsStep,
    StableDiffusionXLImg2ImgPrepareLatentsStep,
    StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep,
    StableDiffusionXLInpaintPrepareLatentsStep,
    StableDiffusionXLControlNetInputStep,
    StableDiffusionXLControlNetUnionInputStep
)
from .encoders import (
    StableDiffusionXLTextEncoderStep,
    StableDiffusionXLAutoIPAdapterStep,
    StableDiffusionXLAutoVaeEncoderStep,
    StableDiffusionXLVaeEncoderStep,
    StableDiffusionXLInpaintVaeEncoderStep,
    StableDiffusionXLIPAdapterStep
)
from .decoders import (
    StableDiffusionXLDecodeStep,
    StableDiffusionXLInpaintDecodeStep,
    StableDiffusionXLAutoDecodeStep
)


# YiYi notes: comment out for now, work on this later
# block mapping
TEXT2IMAGE_BLOCKS = InsertableOrderedDict([
    ("text_encoder", StableDiffusionXLTextEncoderStep),
    ("input", StableDiffusionXLInputStep),
    ("set_timesteps", StableDiffusionXLSetTimestepsStep),
    ("prepare_latents", StableDiffusionXLPrepareLatentsStep),
    ("prepare_add_cond", StableDiffusionXLPrepareAdditionalConditioningStep),
    ("denoise", StableDiffusionXLDenoiseLoop),
    ("decode", StableDiffusionXLDecodeStep)
])

IMAGE2IMAGE_BLOCKS = InsertableOrderedDict([
    ("text_encoder", StableDiffusionXLTextEncoderStep),
    ("image_encoder", StableDiffusionXLVaeEncoderStep),
    ("input", StableDiffusionXLInputStep),
    ("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep),
    ("prepare_latents", StableDiffusionXLImg2ImgPrepareLatentsStep),
    ("prepare_add_cond", StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep),
    ("denoise", StableDiffusionXLDenoiseLoop),
    ("decode", StableDiffusionXLDecodeStep)
])

INPAINT_BLOCKS = InsertableOrderedDict([
    ("text_encoder", StableDiffusionXLTextEncoderStep),
    ("image_encoder", StableDiffusionXLInpaintVaeEncoderStep),
    ("input", StableDiffusionXLInputStep),
    ("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep),
    ("prepare_latents", StableDiffusionXLInpaintPrepareLatentsStep),
    ("prepare_add_cond", StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep),
    ("denoise", StableDiffusionXLInpaintDenoiseLoop),
    ("decode", StableDiffusionXLInpaintDecodeStep)
])

CONTROLNET_BLOCKS = InsertableOrderedDict([
    ("controlnet_input", StableDiffusionXLControlNetInputStep),
    ("denoise", StableDiffusionXLControlNetDenoiseStep),
])

CONTROLNET_UNION_BLOCKS = InsertableOrderedDict([
    ("controlnet_input", StableDiffusionXLControlNetUnionInputStep),
    ("denoise", StableDiffusionXLControlNetDenoiseStep),
])

IP_ADAPTER_BLOCKS = InsertableOrderedDict([
    ("ip_adapter", StableDiffusionXLIPAdapterStep),
])

AUTO_BLOCKS = InsertableOrderedDict([
    ("text_encoder", StableDiffusionXLTextEncoderStep),
    ("ip_adapter", StableDiffusionXLAutoIPAdapterStep),
    ("image_encoder", StableDiffusionXLAutoVaeEncoderStep),
    ("before_denoise", StableDiffusionXLAutoBeforeDenoiseStep),
    ("denoise", StableDiffusionXLAutoDenoiseStep),
    ("decode", StableDiffusionXLAutoDecodeStep)
])


SDXL_SUPPORTED_BLOCKS = {
    "text2img": TEXT2IMAGE_BLOCKS,
    "img2img": IMAGE2IMAGE_BLOCKS,
    "inpaint": INPAINT_BLOCKS,
    "controlnet": CONTROLNET_BLOCKS,
    "controlnet_union": CONTROLNET_UNION_BLOCKS,
    "ip_adapter": IP_ADAPTER_BLOCKS,
    "auto": AUTO_BLOCKS
}
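
# A short sketch of how these preset mappings can be customized, assuming `InsertableOrderedDict`
# exposes an `insert(key, value, index)`-style helper (the method name is illustrative and not
# shown in this diff): copy a preset, splice an extra block in at a chosen position, then build
# the pipeline from the modified mapping.
#
#     custom_blocks = TEXT2IMAGE_BLOCKS.copy()
#     custom_blocks.insert("ip_adapter", StableDiffusionXLIPAdapterStep, 1)  # right after the text encoder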


@@ -0,0 +1,174 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, List, Optional, Tuple, Union, Dict
import PIL
import torch
import numpy as np

from ...loaders import StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin, ModularIPAdapterMixin
from ...image_processor import PipelineImageInput
from ...pipelines.pipeline_utils import StableDiffusionMixin
from ...pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
from ...utils import logging

from ..modular_pipeline import ModularLoader
from ..modular_pipeline_utils import InputParam, OutputParam

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


# YiYi TODO: move to a different file? stable_diffusion_xl_module should have its own folder?
# YiYi Notes: model specific components:
## (1) it should inherit from ModularLoader
## (2) acts like a container that holds components and configs
## (3) define default config (related to components), e.g. default_sample_size, vae_scale_factor, num_channels_unet, num_channels_latents
## (4) inherit from model-specific loader class (e.g. StableDiffusionXLLoraLoaderMixin)
## (5) how to use together with Components_manager?
class StableDiffusionXLModularLoader(
    ModularLoader,
    StableDiffusionMixin,
    TextualInversionLoaderMixin,
    StableDiffusionXLLoraLoaderMixin,
    ModularIPAdapterMixin,
):
    @property
    def default_sample_size(self):
        default_sample_size = 128
        if hasattr(self, "unet") and self.unet is not None:
            default_sample_size = self.unet.config.sample_size
        return default_sample_size

    @property
    def vae_scale_factor(self):
        vae_scale_factor = 8
        if hasattr(self, "vae") and self.vae is not None:
            vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        return vae_scale_factor

    @property
    def num_channels_unet(self):
        num_channels_unet = 4
        if hasattr(self, "unet") and self.unet is not None:
            num_channels_unet = self.unet.config.in_channels
        return num_channels_unet

    @property
    def num_channels_latents(self):
        num_channels_latents = 4
        if hasattr(self, "vae") and self.vae is not None:
            num_channels_latents = self.vae.config.latent_channels
        return num_channels_latents
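
    # Why `vae_scale_factor = 2 ** (len(block_out_channels) - 1)`: every VAE down block after the
    # first halves the spatial resolution, so the SDXL VAE with 4 `block_out_channels` entries gives
    # 2 ** 3 = 8, and a 1024x1024 image maps to 128x128 latents -- which is also why
    # `default_sample_size` falls back to 128 above.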


# YiYi Notes: not used yet, maintain a list of schema that can be used across all pipeline blocks
SDXL_INPUTS_SCHEMA = {
    "prompt": InputParam("prompt", type_hint=Union[str, List[str]], description="The prompt or prompts to guide the image generation"),
    "prompt_2": InputParam("prompt_2", type_hint=Union[str, List[str]], description="The prompt or prompts to be sent to the tokenizer_2 and text_encoder_2"),
    "negative_prompt": InputParam("negative_prompt", type_hint=Union[str, List[str]], description="The prompt or prompts not to guide the image generation"),
    "negative_prompt_2": InputParam("negative_prompt_2", type_hint=Union[str, List[str]], description="The negative prompt or prompts for text_encoder_2"),
    "cross_attention_kwargs": InputParam("cross_attention_kwargs", type_hint=Optional[dict], description="Kwargs dictionary passed to the AttentionProcessor"),
    "clip_skip": InputParam("clip_skip", type_hint=Optional[int], description="Number of layers to skip in CLIP text encoder"),
    "image": InputParam("image", type_hint=PipelineImageInput, required=True, description="The image(s) to modify for img2img or inpainting"),
    "mask_image": InputParam("mask_image", type_hint=PipelineImageInput, required=True, description="Mask image for inpainting, white pixels will be repainted"),
    "generator": InputParam("generator", type_hint=Optional[Union[torch.Generator, List[torch.Generator]]], description="Generator(s) for deterministic generation"),
    "height": InputParam("height", type_hint=Optional[int], description="Height in pixels of the generated image"),
    "width": InputParam("width", type_hint=Optional[int], description="Width in pixels of the generated image"),
    "num_images_per_prompt": InputParam("num_images_per_prompt", type_hint=int, default=1, description="Number of images to generate per prompt"),
    "num_inference_steps": InputParam("num_inference_steps", type_hint=int, default=50, description="Number of denoising steps"),
    "timesteps": InputParam("timesteps", type_hint=Optional[torch.Tensor], description="Custom timesteps for the denoising process"),
    "sigmas": InputParam("sigmas", type_hint=Optional[torch.Tensor], description="Custom sigmas for the denoising process"),
    "denoising_end": InputParam("denoising_end", type_hint=Optional[float], description="Fraction of denoising process to complete before termination"),
    # YiYi Notes: img2img defaults to 0.3, inpainting defaults to 0.9999
    "strength": InputParam("strength", type_hint=float, default=0.3, description="How much to transform the reference image"),
    "denoising_start": InputParam("denoising_start", type_hint=Optional[float], description="Starting point of the denoising process"),
    "latents": InputParam("latents", type_hint=Optional[torch.Tensor], description="Pre-generated noisy latents for image generation"),
    "padding_mask_crop": InputParam("padding_mask_crop", type_hint=Optional[Tuple[int, int]], description="Size of margin in crop for image and mask"),
    "original_size": InputParam("original_size", type_hint=Optional[Tuple[int, int]], description="Original size of the image for SDXL's micro-conditioning"),
    "target_size": InputParam("target_size", type_hint=Optional[Tuple[int, int]], description="Target size for SDXL's micro-conditioning"),
    "negative_original_size": InputParam("negative_original_size", type_hint=Optional[Tuple[int, int]], description="Negative conditioning based on image resolution"),
    "negative_target_size": InputParam("negative_target_size", type_hint=Optional[Tuple[int, int]], description="Negative conditioning based on target resolution"),
    "crops_coords_top_left": InputParam("crops_coords_top_left", type_hint=Tuple[int, int], default=(0, 0), description="Top-left coordinates for SDXL's micro-conditioning"),
    "negative_crops_coords_top_left": InputParam("negative_crops_coords_top_left", type_hint=Tuple[int, int], default=(0, 0), description="Negative conditioning crop coordinates"),
    "aesthetic_score": InputParam("aesthetic_score", type_hint=float, default=6.0, description="Simulates aesthetic score of generated image"),
    "negative_aesthetic_score": InputParam("negative_aesthetic_score", type_hint=float, default=2.0, description="Simulates negative aesthetic score"),
    "eta": InputParam("eta", type_hint=float, default=0.0, description="Parameter η in the DDIM paper"),
    "output_type": InputParam("output_type", type_hint=str, default="pil", description="Output format (pil/tensor/np.array)"),
    "ip_adapter_image": InputParam("ip_adapter_image", type_hint=PipelineImageInput, required=True, description="Image(s) to be used as IP adapter"),
    "control_image": InputParam("control_image", type_hint=PipelineImageInput, required=True, description="ControlNet input condition"),
    "control_guidance_start": InputParam("control_guidance_start", type_hint=Union[float, List[float]], default=0.0, description="When ControlNet starts applying"),
    "control_guidance_end": InputParam("control_guidance_end", type_hint=Union[float, List[float]], default=1.0, description="When ControlNet stops applying"),
    "controlnet_conditioning_scale": InputParam("controlnet_conditioning_scale", type_hint=Union[float, List[float]], default=1.0, description="Scale factor for ControlNet outputs"),
    "guess_mode": InputParam("guess_mode", type_hint=bool, default=False, description="Enables ControlNet encoder to recognize input without prompts"),
    "control_mode": InputParam("control_mode", type_hint=List[int], required=True, description="Control mode for union controlnet")
}
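
# A sketch of how a block could reuse this shared schema instead of re-declaring its parameters
# (the schema is marked "not used yet" above, so this is only the intended usage, not something
# wired up by this diff):
#
#     @property
#     def inputs(self) -> List[InputParam]:
#         return [SDXL_INPUTS_SCHEMA["image"], SDXL_INPUTS_SCHEMA["height"], SDXL_INPUTS_SCHEMA["width"]]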


SDXL_INTERMEDIATE_INPUTS_SCHEMA = {
    "prompt_embeds": InputParam("prompt_embeds", type_hint=torch.Tensor, required=True, description="Text embeddings used to guide image generation"),
    "negative_prompt_embeds": InputParam("negative_prompt_embeds", type_hint=torch.Tensor, description="Negative text embeddings"),
    "pooled_prompt_embeds": InputParam("pooled_prompt_embeds", type_hint=torch.Tensor, required=True, description="Pooled text embeddings"),
    "negative_pooled_prompt_embeds": InputParam("negative_pooled_prompt_embeds", type_hint=torch.Tensor, description="Negative pooled text embeddings"),
    "batch_size": InputParam("batch_size", type_hint=int, required=True, description="Number of prompts"),
    "dtype": InputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"),
    "preprocess_kwargs": InputParam("preprocess_kwargs", type_hint=Optional[dict], description="Kwargs for ImageProcessor"),
    "latents": InputParam("latents", type_hint=torch.Tensor, required=True, description="Initial latents for denoising process"),
    "timesteps": InputParam("timesteps", type_hint=torch.Tensor, required=True, description="Timesteps for inference"),
    "num_inference_steps": InputParam("num_inference_steps", type_hint=int, required=True, description="Number of denoising steps"),
    "latent_timestep": InputParam("latent_timestep", type_hint=torch.Tensor, required=True, description="Initial noise level timestep"),
    "image_latents": InputParam("image_latents", type_hint=torch.Tensor, required=True, description="Latents representing reference image"),
    "mask": InputParam("mask", type_hint=torch.Tensor, required=True, description="Mask for inpainting"),
    "masked_image_latents": InputParam("masked_image_latents", type_hint=torch.Tensor, description="Masked image latents for inpainting"),
    "add_time_ids": InputParam("add_time_ids", type_hint=torch.Tensor, required=True, description="Time ids for conditioning"),
    "negative_add_time_ids": InputParam("negative_add_time_ids", type_hint=torch.Tensor, description="Negative time ids"),
    "timestep_cond": InputParam("timestep_cond", type_hint=torch.Tensor, description="Timestep conditioning for LCM"),
    "noise": InputParam("noise", type_hint=torch.Tensor, description="Noise added to image latents"),
    "crops_coords": InputParam("crops_coords", type_hint=Optional[Tuple[int]], description="Crop coordinates"),
    "ip_adapter_embeds": InputParam("ip_adapter_embeds", type_hint=List[torch.Tensor], description="Image embeddings for IP-Adapter"),
    "negative_ip_adapter_embeds": InputParam("negative_ip_adapter_embeds", type_hint=List[torch.Tensor], description="Negative image embeddings for IP-Adapter"),
    "images": InputParam("images", type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray]], required=True, description="Generated images")
}


SDXL_INTERMEDIATE_OUTPUTS_SCHEMA = {
    "prompt_embeds": OutputParam("prompt_embeds", type_hint=torch.Tensor, description="Text embeddings used to guide image generation"),
    "negative_prompt_embeds": OutputParam("negative_prompt_embeds", type_hint=torch.Tensor, description="Negative text embeddings"),
    "pooled_prompt_embeds": OutputParam("pooled_prompt_embeds", type_hint=torch.Tensor, description="Pooled text embeddings"),
    "negative_pooled_prompt_embeds": OutputParam("negative_pooled_prompt_embeds", type_hint=torch.Tensor, description="Negative pooled text embeddings"),
    "batch_size": OutputParam("batch_size", type_hint=int, description="Number of prompts"),
    "dtype": OutputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"),
    "image_latents": OutputParam("image_latents", type_hint=torch.Tensor, description="Latents representing reference image"),
    "mask": OutputParam("mask", type_hint=torch.Tensor, description="Mask for inpainting"),
    "masked_image_latents": OutputParam("masked_image_latents", type_hint=torch.Tensor, description="Masked image latents for inpainting"),
    "crops_coords": OutputParam("crops_coords", type_hint=Optional[Tuple[int]], description="Crop coordinates"),
    "timesteps": OutputParam("timesteps", type_hint=torch.Tensor, description="Timesteps for inference"),
    "num_inference_steps": OutputParam("num_inference_steps", type_hint=int, description="Number of denoising steps"),
    "latent_timestep": OutputParam("latent_timestep", type_hint=torch.Tensor, description="Initial noise level timestep"),
    "add_time_ids": OutputParam("add_time_ids", type_hint=torch.Tensor, description="Time ids for conditioning"),
    "negative_add_time_ids": OutputParam("negative_add_time_ids", type_hint=torch.Tensor, description="Negative time ids"),
    "timestep_cond": OutputParam("timestep_cond", type_hint=torch.Tensor, description="Timestep conditioning for LCM"),
    "latents": OutputParam("latents", type_hint=torch.Tensor, description="Denoised latents"),
    "noise": OutputParam("noise", type_hint=torch.Tensor, description="Noise added to image latents"),
    "ip_adapter_embeds": OutputParam("ip_adapter_embeds", type_hint=List[torch.Tensor], description="Image embeddings for IP-Adapter"),
    "negative_ip_adapter_embeds": OutputParam("negative_ip_adapter_embeds", type_hint=List[torch.Tensor], description="Negative image embeddings for IP-Adapter"),
    "images": OutputParam("images", type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray]], description="Generated images")
}


SDXL_OUTPUTS_SCHEMA = {
    "images": OutputParam("images", type_hint=Union[Tuple[Union[List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray]]], StableDiffusionXLPipelineOutput], description="The final generated images")
}

@@ -0,0 +1,43 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, List, Optional, Tuple, Union, Dict
from ...utils import logging
from ..modular_pipeline import SequentialPipelineBlocks

from .denoise import StableDiffusionXLAutoDenoiseStep
from .before_denoise import StableDiffusionXLAutoBeforeDenoiseStep
from .decoders import StableDiffusionXLAutoDecodeStep
from .encoders import StableDiffusionXLTextEncoderStep, StableDiffusionXLAutoIPAdapterStep, StableDiffusionXLAutoVaeEncoderStep

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


class StableDiffusionXLAutoPipeline(SequentialPipelineBlocks):
    block_classes = [StableDiffusionXLTextEncoderStep, StableDiffusionXLAutoIPAdapterStep, StableDiffusionXLAutoVaeEncoderStep, StableDiffusionXLAutoBeforeDenoiseStep, StableDiffusionXLAutoDenoiseStep, StableDiffusionXLAutoDecodeStep]
    block_names = ["text_encoder", "ip_adapter", "image_encoder", "before_denoise", "denoise", "decoder"]

    @property
    def description(self):
        return "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using Stable Diffusion XL.\n" + \
               "- for image-to-image generation, you need to provide either `image` or `image_latents`\n" + \
               "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n" + \
               "- to run the controlnet workflow, you need to provide `control_image`\n" + \
               "- to run the controlnet_union workflow, you need to provide `control_image` and `control_mode`\n" + \
               "- to run the ip_adapter workflow, you need to provide `ip_adapter_image`\n" + \
               "- for text-to-image generation, all you need to provide is `prompt`"


@@ -246,14 +246,15 @@ def _get_connected_pipeline(pipeline_cls):
     return _get_task_class(AUTO_INPAINT_PIPELINES_MAPPING, pipeline_cls.__name__, throw_error_if_not_exist=False)


-def _get_task_class(mapping, pipeline_class_name, throw_error_if_not_exist: bool = True):
-    def get_model(pipeline_class_name):
-        for task_mapping in SUPPORTED_TASKS_MAPPINGS:
-            for model_name, pipeline in task_mapping.items():
-                if pipeline.__name__ == pipeline_class_name:
-                    return model_name
-
-    model_name = get_model(pipeline_class_name)
+def _get_model(pipeline_class_name):
+    for task_mapping in SUPPORTED_TASKS_MAPPINGS:
+        for model_name, pipeline in task_mapping.items():
+            if pipeline.__name__ == pipeline_class_name:
+                return model_name
+
+
+def _get_task_class(mapping, pipeline_class_name, throw_error_if_not_exist: bool = True):
+    model_name = _get_model(pipeline_class_name)

     if model_name is not None:
         task_class = mapping.get(model_name, None)
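
# With `_get_model` now a module-level helper, it can be reused directly, e.g. to look up a
# sibling task class for a pipeline (the returned key, e.g. "stable-diffusion-xl", is whatever
# name the SUPPORTED_TASKS_MAPPINGS tables use; shown here only as an illustration):
#
#     model_name = _get_model("StableDiffusionXLImg2ImgPipeline")
#     inpaint_cls = AUTO_INPAINT_PIPELINES_MAPPING.get(model_name)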


@@ -912,12 +912,6 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
                 f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
             )

-        latents_mean = latents_std = None
-        if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None:
-            latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
-        if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None:
-            latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
-
         # Offload text encoder if `enable_model_cpu_offload` was enabled
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.text_encoder_2.to("cpu")

@@ -931,6 +925,11 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
             init_latents = image

         else:
+            latents_mean = latents_std = None
+            if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None:
+                latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
+            if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None:
+                latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
             # make sure the VAE is in float32 mode, as it overflows in float16
             if self.vae.config.force_upcast:
                 image = image.float()
@@ -867,12 +867,6 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
                 f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
             )

-        latents_mean = latents_std = None
-        if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None:
-            latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
-        if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None:
-            latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
-
         # Offload text encoder if `enable_model_cpu_offload` was enabled
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.text_encoder_2.to("cpu")

@@ -886,6 +880,11 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
             init_latents = image

         else:
+            latents_mean = latents_std = None
+            if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None:
+                latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
+            if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None:
+                latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
             # make sure the VAE is in float32 mode, as it overflows in float16
             if self.vae.config.force_upcast:
                 image = image.float()
@@ -609,12 +609,6 @@ class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffu
                 f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
             )

-        latents_mean = latents_std = None
-        if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None:
-            latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
-        if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None:
-            latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
-
         # Offload text encoder if `enable_model_cpu_offload` was enabled
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.text_encoder_2.to("cpu")

@@ -628,6 +622,11 @@ class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffu
             init_latents = image

         else:
+            latents_mean = latents_std = None
+            if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None:
+                latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
+            if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None:
+                latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
             # make sure the VAE is in float32 mode, as it overflows in float16
             if self.vae.config.force_upcast:
                 image = image.float()
@@ -917,12 +917,6 @@ class StableDiffusionXLControlNetPAGImg2ImgPipeline(
                 f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
             )

-        latents_mean = latents_std = None
-        if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None:
-            latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
-        if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None:
-            latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
-
         # Offload text encoder if `enable_model_cpu_offload` was enabled
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.text_encoder_2.to("cpu")

@@ -936,6 +930,11 @@ class StableDiffusionXLControlNetPAGImg2ImgPipeline(
             init_latents = image

         else:
+            latents_mean = latents_std = None
+            if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None:
+                latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
+            if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None:
+                latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
             # make sure the VAE is in float32 mode, as it overflows in float16
             if self.vae.config.force_upcast:
                 image = image.float()
@@ -707,12 +707,6 @@ class StableDiffusionXLPAGImg2ImgPipeline(
                 f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
             )

-        latents_mean = latents_std = None
-        if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None:
-            latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
-        if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None:
-            latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
-
         # Offload text encoder if `enable_model_cpu_offload` was enabled
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.text_encoder_2.to("cpu")

@@ -726,6 +720,11 @@ class StableDiffusionXLPAGImg2ImgPipeline(
             init_latents = image

         else:
+            latents_mean = latents_std = None
+            if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None:
+                latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
+            if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None:
+                latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
             # make sure the VAE is in float32 mode, as it overflows in float16
             if self.vae.config.force_upcast:
                 image = image.float()
@@ -331,6 +331,20 @@ def maybe_raise_or_warn(
         )


+# a simpler version of get_class_obj_and_candidates, it won't work with custom code
+def simple_get_class_obj(library_name, class_name):
+    from diffusers import pipelines
+    is_pipeline_module = hasattr(pipelines, library_name)
+
+    if is_pipeline_module:
+        pipeline_module = getattr(pipelines, library_name)
+        class_obj = getattr(pipeline_module, class_name)
+    else:
+        library = importlib.import_module(library_name)
+        class_obj = getattr(library, class_name)
+
+    return class_obj
+
 def get_class_obj_and_candidates(
     library_name, class_name, importable_classes, pipelines, is_pipeline_module, component_name=None, cache_dir=None
 ):
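
# Example use of the simpler helper added above: it resolves a class either from a pipeline
# module inside `diffusers.pipelines` or from any importable library, without the custom-code
# handling that `get_class_obj_and_candidates` provides (classes shown only for illustration):
#
#     simple_get_class_obj("stable_diffusion_xl", "StableDiffusionXLPipeline")  # pipeline module path
#     simple_get_class_obj("diffusers", "UNet2DConditionModel")                 # plain library import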

@@ -839,6 +853,9 @@ def _fetch_class_library_tuple(module):
     library = not_compiled_module.__module__

     # retrieve class_name
+    if isinstance(not_compiled_module, type):
+        class_name = not_compiled_module.__name__
+    else:
         class_name = not_compiled_module.__class__.__name__

     return (library, class_name)
@@ -427,7 +427,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
             module_is_sequentially_offloaded(module) for _, module in self.components.items()
         )

-        is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
+        is_pipeline_device_mapped = hasattr(self, "hf_device_map") and self.hf_device_map is not None and len(self.hf_device_map) > 1
         if is_pipeline_device_mapped:
             raise ValueError(
                 "It seems like you have activated a device mapping strategy on the pipeline which doesn't allow explicit device placement using `to()`. You can call `reset_device_map()` to remove the existing device map from the pipeline."
@@ -444,6 +444,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
                 "You are trying to call `.to('cuda')` on a pipeline that has models quantized with `bitsandbytes`. Your current `accelerate` installation does not support it. Please upgrade the installation."
             )

+
         # Display a warning in this case (the operation succeeds but the benefits are lost)
         pipeline_is_offloaded = any(module_is_offloaded(module) for _, module in self.components.items())
         if pipeline_is_offloaded and device_type in ["cuda", "xpu"]:
@@ -1119,9 +1120,11 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
                 The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
                 automatically detect the available accelerator and use.
         """
+
         self._maybe_raise_error_if_group_offload_active(raise_error=True)

-        is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
+        is_pipeline_device_mapped = hasattr(self, "hf_device_map") and self.hf_device_map is not None and len(self.hf_device_map) > 1
+
         if is_pipeline_device_mapped:
             raise ValueError(
                 "It seems like you have activated a device mapping strategy on the pipeline so calling `enable_model_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_model_cpu_offload()`."
@@ -1245,7 +1248,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
             raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
         self.remove_all_hooks()

-        is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
+        is_pipeline_device_mapped = hasattr(self, "hf_device_map") and self.hf_device_map is not None and len(self.hf_device_map) > 1
         if is_pipeline_device_mapped:
             raise ValueError(
                 "It seems like you have activated a device mapping strategy on the pipeline so calling `enable_sequential_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_sequential_cpu_offload()`."
@@ -1945,9 +1948,10 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
            f"{'' if k.startswith('_') else '_'}{k}": v for k, v in original_config.items() if k not in pipeline_kwargs
        }

+        optional_components = pipeline._optional_components if hasattr(pipeline, "_optional_components") and pipeline._optional_components else []
        missing_modules = (
            set(expected_modules)
-            - set(pipeline._optional_components)
+            - set(optional_components)
            - set(pipeline_kwargs.keys())
            - set(true_optional_modules)
        )
@@ -695,12 +695,6 @@ class StableDiffusionXLImg2ImgPipeline(
                 f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
             )

-        latents_mean = latents_std = None
-        if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None:
-            latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
-        if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None:
-            latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
-
         # Offload text encoder if `enable_model_cpu_offload` was enabled
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.text_encoder_2.to("cpu")
@@ -714,6 +708,11 @@ class StableDiffusionXLImg2ImgPipeline(
             init_latents = image

         else:
+            latents_mean = latents_std = None
+            if hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None:
+                latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1)
+            if hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None:
+                latents_std = torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1)
             # make sure the VAE is in float32 mode, as it overflows in float16
             if self.vae.config.force_upcast:
                 image = image.float()
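For context, the two tensors built in the `else:` branch are the per-channel statistics typically used to normalize freshly encoded latents. A hedged sketch of that normalization; the shapes and the scaling factor here are assumptions for illustration, not part of this diff:

import torch

# Sketch: how latents_mean / latents_std are usually applied after VAE encoding.
scaling_factor = 0.13025                 # common SDXL VAE value; assumed for this example
init_latents = torch.randn(1, 4, 64, 64)
latents_mean = torch.zeros(1, 4, 1, 1)
latents_std = torch.ones(1, 4, 1, 1)

init_latents = (init_latents - latents_mean) * scaling_factor / latents_std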
@@ -1388,6 +1388,21 @@ class LDMSuperResolutionPipeline(metaclass=DummyObject):
         requires_backends(cls, ["torch"])


+class ModularLoader(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class PNDMPipeline(metaclass=DummyObject):
     _backends = ["torch"]

@@ -2432,6 +2432,21 @@ class StableDiffusionXLInstructPix2PixPipeline(metaclass=DummyObject):
         requires_backends(cls, ["torch", "transformers"])


+class StableDiffusionXLModularLoader(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class StableDiffusionXLPAGImg2ImgPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]

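Both new dummy classes follow the usual diffusers placeholder pattern: the names stay importable even when the backend is missing, and any attempt to use them raises. A simplified, self-contained sketch of that behaviour (not the library's exact implementation):

# Sketch: a DummyObject-style placeholder that fails loudly when its backend is absent.
def requires_backends(obj, backends):
    # The library checks real availability; here we pretend nothing is installed.
    name = obj.__name__ if isinstance(obj, type) else type(obj).__name__
    raise ImportError(f"{name} requires the following backends: {list(backends)}")

class DummyObject(type):
    def __getattr__(cls, key):
        requires_backends(cls, cls._backends)

class ModularLoader(metaclass=DummyObject):
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])

try:
    ModularLoader()
except ImportError as err:
    print(err)  # ModularLoader requires the following backends: ['torch']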
@@ -15,13 +15,16 @@
 """Utilities to dynamically load objects from the Hub."""

 import importlib
+import signal
 import inspect
 import json
 import os
 import re
 import shutil
 import sys
+import threading
 from pathlib import Path
+from types import ModuleType
 from typing import Dict, Optional, Union
 from urllib import request

@@ -37,6 +40,8 @@ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

 # See https://huggingface.co/datasets/diffusers/community-pipelines-mirror
 COMMUNITY_PIPELINES_MIRROR_ID = "diffusers/community-pipelines-mirror"
+TIME_OUT_REMOTE_CODE = int(os.getenv("DIFFUSERS_TIMEOUT_REMOTE_CODE", 15))
+_HF_REMOTE_CODE_LOCK = threading.Lock()


 def get_diffusers_versions():
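The new constant reads its value from an environment variable, so the remote-code prompt timeout can be tuned (or set to 0 to disable the prompt, as the CI comment further down suggests). A tiny sketch of the override, using the variable name introduced above:

import os

# Sketch: default is 15 seconds; override before the module is imported.
os.environ["DIFFUSERS_TIMEOUT_REMOTE_CODE"] = "30"
TIME_OUT_REMOTE_CODE = int(os.getenv("DIFFUSERS_TIMEOUT_REMOTE_CODE", 15))
print(TIME_OUT_REMOTE_CODE)  # 30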
@@ -154,15 +159,87 @@ def check_imports(filename):
     return get_relative_imports(filename)


-def get_class_in_module(class_name, module_path):
+def _raise_timeout_error(signum, frame):
+    raise ValueError(
+        "Loading this model requires you to execute custom code contained in the model repository on your local "
+        "machine. Please set the option `trust_remote_code=True` to permit loading of this model."
+    )
+
+
+def resolve_trust_remote_code(trust_remote_code, model_name, has_remote_code):
+    if trust_remote_code is None:
+        if has_remote_code and TIME_OUT_REMOTE_CODE > 0:
+            prev_sig_handler = None
+            try:
+                prev_sig_handler = signal.signal(signal.SIGALRM, _raise_timeout_error)
+                signal.alarm(TIME_OUT_REMOTE_CODE)
+                while trust_remote_code is None:
+                    answer = input(
+                        f"The repository for {model_name} contains custom code which must be executed to correctly "
+                        f"load the model. You can inspect the repository content at https://hf.co/{model_name}.\n"
+                        f"You can avoid this prompt in future by passing the argument `trust_remote_code=True`.\n\n"
+                        f"Do you wish to run the custom code? [y/N] "
+                    )
+                    if answer.lower() in ["yes", "y", "1"]:
+                        trust_remote_code = True
+                    elif answer.lower() in ["no", "n", "0", ""]:
+                        trust_remote_code = False
+                signal.alarm(0)
+            except Exception:
+                # OS which does not support signal.SIGALRM
+                raise ValueError(
+                    f"The repository for {model_name} contains custom code which must be executed to correctly "
+                    f"load the model. You can inspect the repository content at https://hf.co/{model_name}.\n"
+                    f"Please pass the argument `trust_remote_code=True` to allow custom code to be run."
+                )
+            finally:
+                if prev_sig_handler is not None:
+                    signal.signal(signal.SIGALRM, prev_sig_handler)
+                    signal.alarm(0)
+        elif has_remote_code:
+            # For the CI which puts the timeout at 0
+            _raise_timeout_error(None, None)
+
+    if has_remote_code and not trust_remote_code:
+        raise ValueError(
+            f"Loading {model_name} requires you to execute the configuration file in that"
+            " repo on your local machine. Make sure you have read the code there to avoid malicious use, then"
+            " set the option `trust_remote_code=True` to remove this error."
+        )
+
+    return trust_remote_code
+
+
+def get_class_in_module(class_name, module_path, force_reload=False):
     """
     Import a module on the cache directory for modules and extract a class from it.
     """
-    module_path = module_path.replace(os.path.sep, ".")
-    module = importlib.import_module(module_path)
+    name = os.path.normpath(module_path)
+    if name.endswith(".py"):
+        name = name[:-3]
+    name = name.replace(os.path.sep, ".")
+    module_file: Path = Path(HF_MODULES_CACHE) / module_path
+
+    with _HF_REMOTE_CODE_LOCK:
+        if force_reload:
+            sys.modules.pop(name, None)
+            importlib.invalidate_caches()
+        cached_module: Optional[ModuleType] = sys.modules.get(name)
+        module_spec = importlib.util.spec_from_file_location(name, location=module_file)
+
+        module: ModuleType
+        if cached_module is None:
+            module = importlib.util.module_from_spec(module_spec)
+            # insert it into sys.modules before any loading begins
+            sys.modules[name] = module
+        else:
+            module = cached_module
+
+        module_spec.loader.exec_module(module)
+
     if class_name is None:
         return find_pipeline_class(module)

     return getattr(module, class_name)

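A hedged sketch of how the new helper is meant to be driven by a loader; the import path and repo id are assumptions based on the module docstring above, not something this diff spells out:

# Sketch: explicit flags pass straight through; None plus remote code triggers the prompt/timeout.
from diffusers.utils.dynamic_modules_utils import resolve_trust_remote_code  # assumed module path

trust = resolve_trust_remote_code(
    trust_remote_code=True,             # caller opted in explicitly, so no prompt is shown
    model_name="user/custom-pipeline",  # illustrative repo id
    has_remote_code=True,
)
assert trust is True

# Passing trust_remote_code=None when remote code exists either prompts on a TTY or,
# with DIFFUSERS_TIMEOUT_REMOTE_CODE=0 (as in CI), raises the ValueError from _raise_timeout_error.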
@@ -454,4 +531,4 @@ def get_class_from_dynamic_module(
         revision=revision,
         local_files_only=local_files_only,
     )
-    return get_class_in_module(class_name, final_module.replace(".py", ""))
+    return get_class_in_module(class_name, final_module)
@@ -90,6 +90,11 @@ def is_compiled_module(module) -> bool:
     return isinstance(module, torch._dynamo.eval_frame.OptimizedModule)


+def unwrap_module(module):
+    """Unwraps a module if it was compiled with torch.compile()"""
+    return module._orig_mod if is_compiled_module(module) else module
+
+
 def fourier_filter(x_in: "torch.Tensor", threshold: int, scale: int) -> "torch.Tensor":
     """Fourier filter as introduced in FreeU (https://arxiv.org/abs/2309.11497).

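A short self-contained sketch of when `unwrap_module` matters; it restates the two helpers from the hunk above and assumes torch >= 2.0 so that `torch.compile` is available:

import torch

def is_compiled_module(module) -> bool:
    return isinstance(module, torch._dynamo.eval_frame.OptimizedModule)

def unwrap_module(module):
    """Unwraps a module if it was compiled with torch.compile()"""
    return module._orig_mod if is_compiled_module(module) else module

net = torch.nn.Linear(4, 4)
compiled = torch.compile(net)          # wraps net in an OptimizedModule (compilation itself is lazy)
assert unwrap_module(compiled) is net  # the original module is recovered
assert unwrap_module(net) is net       # plain modules pass through unchanged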