style

Apply suggestions from code review
Merge branch 'main' into fix_unet_2d_condition_attn_naming
2025-12-06 20:44:33 +08:00 · 2024-04-05 16:26:38 +00:00 · 2024-04-04 15:55:12 -10:00 · 2024-04-04 15:44:54 -10:00 · 2024-04-04 23:08:34 +00:00 · 2024-04-04 18:11:17 +00:00
2 changed files with 155 additions and 113 deletions
--- a/src/diffusers/configuration_utils.py
+++ b/src/diffusers/configuration_utils.py
@@ -626,6 +626,51 @@ def register_to_config(init):
                "not inherit from `ConfigMixin`."
            )

+        # deprecate `attention_head_dim`: helper that maps the legacy value onto `num_attention_heads` using the block layout found in `init_kwargs`
+        def maybe_correct_attention_head_dim(attention_head_dim):
+            down_block_types = init_kwargs.get("down_block_types", None)
+            up_block_types = init_kwargs.get("up_block_types", None)
+            mid_block_type = init_kwargs.get("mid_block_type", None)
+            block_out_channels = init_kwargs.get("block_out_channels", None)
+            # NOTE(review): a kwarg that was not passed is None here, so the `in` checks below (and the `block_out_channels` loops) raise TypeError - confirm callers always pass them
+            if (
+                "CrossAttnDownBlock2D" in down_block_types
+                or "CrossAttnUpBlock2D" in up_block_types
+                or mid_block_type == "UNetMidBlock2DCrossAttn"
+            ):
+                incorrect_attention_head_dim_name = True
+            else:
+                incorrect_attention_head_dim_name = False
+            # with any of the cross-attention block types above, the value is passed through unchanged as `num_attention_heads`; otherwise it is derived from `block_out_channels`
+            if incorrect_attention_head_dim_name:
+                num_attention_heads = attention_head_dim
+            else:
+                # we use attention_head_dim to calculate num_attention_heads: one `out_channels // head_dim` entry per block (an int head dim is applied to every block)
+                if isinstance(attention_head_dim, int):
+                    num_attention_heads = [out_channels // attention_head_dim for out_channels in block_out_channels]
+                else:
+                    num_attention_heads = [
+                        out_channels // attn_dim
+                        for out_channels, attn_dim in zip(block_out_channels, attention_head_dim)
+                    ]
+            return num_attention_heads
+
+        if self.__class__.__name__ == "UNet2DConditionModel":
+            # As of now it is not possible to define a UNet2DConditionModel via `attention_head_dim`
+            # If `attention_head_dim` is defined we simply transfer the value to the "correct"
+            # `num_attention_heads` config name (see: https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131)
+            attention_head_dim = init_kwargs.pop("attention_head_dim", None)
+            num_attention_heads = init_kwargs.get("num_attention_heads", None)
+            if attention_head_dim is not None:
+                deprecation_message = " `attention_head_dim` is deprecated and will be removed in a future version. Use `num_attention_heads` instead."
+                deprecate("attention_head_dim not None", "1.0.0", deprecation_message, standard_warn=False)
+
+                num_attention_heads = maybe_correct_attention_head_dim(attention_head_dim)
+                logger.info(
+                    f"Changing `attention_head_dim = {attention_head_dim}` to `num_attention_heads = {num_attention_heads}`..."
+                )
+                init_kwargs["num_attention_heads"] = num_attention_heads
+
        ignore = getattr(self, "ignore_for_config", [])
        # Get positional arguments aligned with kwargs
        new_kwargs = {}
--- a/src/diffusers/models/unets/unet_2d_condition.py
+++ b/src/diffusers/models/unets/unet_2d_condition.py
@@ -68,96 +68,102 @@ class UNet2DConditionOutput(BaseOutput):

 class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin):
    r"""
-    A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
-    shaped output.
+        A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
+        shaped output.

-    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
-    for all models (such as downloading or saving).
+        This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods
+        implemented for all models (such as downloading or saving).

-    Parameters:
-        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
-            Height and width of input/output sample.
-        in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample.
-        out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
-        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
-        flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
-            Whether to flip the sin to cos in the time embedding.
-        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
-        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
-            The tuple of downsample blocks to use.
-        mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
-            Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`, `UNetMidBlock2D`, or
-            `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
-        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
-            The tuple of upsample blocks to use.
-        only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`):
-            Whether to include self-attention in the basic transformer blocks, see
-            [`~models.attention.BasicTransformerBlock`].
-        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
-            The tuple of output channels for each block.
-        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
-        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
-        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
-        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
-        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
-        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
-            If `None`, normalization and activation layers is skipped in post-processing.
-        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
-        cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
-            The dimension of the cross attention features.
-        transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
-            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
-            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
-            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
-        reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None):
-            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling
-            blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for
-            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
-            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
-        encoder_hid_dim (`int`, *optional*, defaults to None):
-            If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
-            dimension to `cross_attention_dim`.
-        encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
-            If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
-            embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
-        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
-        num_attention_heads (`int`, *optional*):
-            The number of attention heads. If not defined, defaults to `attention_head_dim`
-        resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
-            for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
-        class_embed_type (`str`, *optional*, defaults to `None`):
-            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
-            `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
-        addition_embed_type (`str`, *optional*, defaults to `None`):
-            Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
-            "text". "text" will use the `TextTimeEmbedding` layer.
-        addition_time_embed_dim: (`int`, *optional*, defaults to `None`):
-            Dimension for the timestep embeddings.
-        num_class_embeds (`int`, *optional*, defaults to `None`):
-            Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
-            class conditioning with `class_embed_type` equal to `None`.
-        time_embedding_type (`str`, *optional*, defaults to `positional`):
-            The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
-        time_embedding_dim (`int`, *optional*, defaults to `None`):
-            An optional override for the dimension of the projected time embedding.
-        time_embedding_act_fn (`str`, *optional*, defaults to `None`):
-            Optional activation function to use only once on the time embeddings before they are passed to the rest of
-            the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
-        timestep_post_act (`str`, *optional*, defaults to `None`):
-            The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
-        time_cond_proj_dim (`int`, *optional*, defaults to `None`):
-            The dimension of `cond_proj` layer in the timestep embedding.
-        conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer.
-        conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer.
-        projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when
-            `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
-        class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
-            embeddings with the class embeddings.
-        mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`):
-            Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If
-            `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the
-            `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False`
-            otherwise.
+        Parameters:
+            sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
+                Height and width of input/output sample.
+            in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample.
+            out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
+            center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
+            flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
+                Whether to flip the sin to cos in the time embedding.
+            freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
+            down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+                The tuple of downsample blocks to use.
+            mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
+                Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`, `UNetMidBlock2D`, or
+                `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
+            up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
+                The tuple of upsample blocks to use.
+            only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`):
+                Whether to include self-attention in the basic transformer blocks, see
+                [`~models.attention.BasicTransformerBlock`].
+            block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+                The tuple of output channels for each block.
+            layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
+            downsample_padding (`int`, *optional*, defaults to 1):
+                The padding to use for the downsampling convolution.
+            mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
+            dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+            act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+            norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
+                If `None`, normalization and activation layers are skipped in post-processing.
+            norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
+            cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
+                The dimension of the cross attention features.
+            transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
+                The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
+                [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
+                [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+            reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None):
+                The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling
+                blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for
+                [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
+                [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+            encoder_hid_dim (`int`, *optional*, defaults to None):
+                If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
+                dimension to `cross_attention_dim`.
+            encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
+                If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
+                embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
+            attention_head_dim (`int`, *optional*):
+                The dimension of the attention heads. Note that this configuration parameter was previously incorrectly
+                named, as stated in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131,
+                and therefore will be automatically renamed to `num_attention_heads` if `num_attention_heads`
+                is not provided. If `num_attention_heads` is provided this configuration parameter will be ignored for
+                now.
+            num_attention_heads (`int`, *optional*, defaults to 8):
+                The number of attention heads.
+            resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
+                for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
+            class_embed_type (`str`, *optional*, defaults to `None`):
+                The type of class embedding to use which is ultimately summed with the time embeddings. Choose from
+                `None`, `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
+            addition_embed_type (`str`, *optional*, defaults to `None`):
+                Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
+                "text". "text" will use the `TextTimeEmbedding` layer.
+            addition_time_embed_dim: (`int`, *optional*, defaults to `None`):
+                Dimension for the timestep embeddings.
+            num_class_embeds (`int`, *optional*, defaults to `None`):
+                Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
+                class conditioning with `class_embed_type` equal to `None`.
+            time_embedding_type (`str`, *optional*, defaults to `positional`):
+                The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
+            time_embedding_dim (`int`, *optional*, defaults to `None`):
+                An optional override for the dimension of the projected time embedding.
+            time_embedding_act_fn (`str`, *optional*, defaults to `None`):
+                Optional activation function to use only once on the time embeddings before they are passed to the rest
+                of the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
+            timestep_post_act (`str`, *optional*, defaults to `None`):
+                The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
+            time_cond_proj_dim (`int`, *optional*, defaults to `None`):
+                The dimension of `cond_proj` layer in the timestep embedding.
+            conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer.
+            conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer.
+            projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when
+                `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
+            class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
+                embeddings with the class embeddings.
+            mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`):
+                Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If
+                `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the
+                `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to
+                `False` otherwise.
    """

    _supports_gradient_checkpointing = True
@@ -193,8 +199,8 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
        reverse_transformer_layers_per_block: Optional[Tuple[Tuple[int]]] = None,
        encoder_hid_dim: Optional[int] = None,
        encoder_hid_dim_type: Optional[str] = None,
-        attention_head_dim: Union[int, Tuple[int]] = 8,
-        num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
+        num_attention_heads: Union[int, Tuple[int]] = 8,
+        attention_head_dim: Optional[Union[int, Tuple[int]]] = None,
        dual_cross_attention: bool = False,
        use_linear_projection: bool = False,
        class_embed_type: Optional[str] = None,
@@ -223,18 +229,15 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,

        self.sample_size = sample_size

-        if num_attention_heads is not None:
-            raise ValueError(
-                "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
-            )
-
-        # If `num_attention_heads` is not defined (which is the case for most models)
-        # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
+        # If `num_attention_heads` is not defined (which is the case for most models) we will rename `attention_head_dim` to `num_attention_heads`
        # The reason for this behavior is to correct for incorrectly named variables that were introduced
        # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
-        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
-        # which is why we correct for the naming here.
-        num_attention_heads = num_attention_heads or attention_head_dim
+        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is quite difficult, so the naming is corrected inside the `register_to_config`
+        # function for now.
+        if attention_head_dim is not None:
+            raise ValueError(
+                "It is not yet possible to define the attention head dim via `attention_head_dim` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `attention_head_dim` alongside will only be supported in diffusers v2.0.0"
+            )

        # Check inputs
        self._check_config(
@@ -246,7 +249,6 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
            cross_attention_dim=cross_attention_dim,
            transformer_layers_per_block=transformer_layers_per_block,
            reverse_transformer_layers_per_block=reverse_transformer_layers_per_block,
-            attention_head_dim=attention_head_dim,
            num_attention_heads=num_attention_heads,
        )

@@ -309,6 +311,13 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
        self.down_blocks = nn.ModuleList([])
        self.up_blocks = nn.ModuleList([])

+        if isinstance(num_attention_heads, int):
+            num_attention_heads = (num_attention_heads,) * len(down_block_types)
+
+        attention_head_dim = [
+            out_channels // num_heads for out_channels, num_heads in zip(block_out_channels, num_attention_heads)
+        ]
+
        if isinstance(only_cross_attention, bool):
            if mid_block_only_cross_attention is None:
                mid_block_only_cross_attention = only_cross_attention
@@ -318,12 +327,6 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
        if mid_block_only_cross_attention is None:
            mid_block_only_cross_attention = False

-        if isinstance(num_attention_heads, int):
-            num_attention_heads = (num_attention_heads,) * len(down_block_types)
-
-        if isinstance(attention_head_dim, int):
-            attention_head_dim = (attention_head_dim,) * len(down_block_types)
-
        if isinstance(cross_attention_dim, int):
            cross_attention_dim = (cross_attention_dim,) * len(down_block_types)

@@ -489,7 +492,6 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
        cross_attention_dim: Union[int, Tuple[int]],
        transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple[int]]],
        reverse_transformer_layers_per_block: bool,
-        attention_head_dim: int,
        num_attention_heads: Optional[Union[int, Tuple[int]]],
    ):
        if len(down_block_types) != len(up_block_types):
@@ -512,11 +514,6 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
            )

-        if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
-            raise ValueError(
-                f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
-            )
-
        if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
Author	SHA1	Message	Date
yiyixu	a2c5ea50bb	style	2024-04-05 16:26:38 +00:00
YiYi Xu	70c55aea58	Apply suggestions from code review	2024-04-04 15:55:12 -10:00
YiYi Xu	020660152e	Merge branch 'main' into fix_unet_2d_condition_attn_naming	2024-04-04 15:44:54 -10:00
yiyixu	d0f7b8200a	up	2024-04-04 23:08:34 +00:00
yiyixu	568b983df6	update	2024-04-04 18:11:17 +00:00
Patrick von Platen	32a71b35ae	fix naming	2024-02-19 11:42:56 +00:00