Mirror of https://github.com/huggingface/diffusers.git, synced 2025-12-06 20:44:33 +08:00
Compare commits: custom-cod...v0.16.1 (4 commits: 9b14ce397e, 23159f4adb, 4c476e99b5, 9c876a5915)
@@ -28,8 +28,8 @@ Our work underscores the potential of larger UNet architectures in the first sta
 ## Usage
 
 Before you can use IF, you need to accept its usage conditions. To do so:
-1. Make sure to have a [Hugging Face account](https://huggingface.co/join) and be loggin in
-2. Accept the license on the model card of [DeepFloyd/IF-I-IF-v1.0](https://huggingface.co/DeepFloyd/IF-I-IF-v1.0) and [DeepFloyd/IF-II-L-v1.0](https://huggingface.co/DeepFloyd/IF-II-L-v1.0)
+1. Make sure to have a [Hugging Face account](https://huggingface.co/join) and be logged in
+2. Accept the license on the model card of [DeepFloyd/IF-I-XL-v1.0](https://huggingface.co/DeepFloyd/IF-I-XL-v1.0). Accepting the license on the stage I model card will auto accept for the other IF models.
 3. Make sure to login locally. Install `huggingface_hub`
 ```sh
 pip install huggingface_hub --upgrade
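For anyone following these steps, the login itself can also be scripted. A minimal sketch using `huggingface_hub`'s public `login()` helper (the CLI equivalent is `huggingface-cli login`):

```py
# Minimal local-login sketch; login() is part of huggingface_hub's public API.
from huggingface_hub import login

# Prompts for an access token from https://huggingface.co/settings/tokens;
# pass token="hf_..." explicitly for non-interactive environments.
login()
```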
@@ -62,7 +62,7 @@ The following sections give more in-detail examples of how to use IF. Specifical
 
 **Available checkpoints**
 - *Stage-1*
-  - [DeepFloyd/IF-I-IF-v1.0](https://huggingface.co/DeepFloyd/IF-I-IF-v1.0)
+  - [DeepFloyd/IF-I-XL-v1.0](https://huggingface.co/DeepFloyd/IF-I-XL-v1.0)
   - [DeepFloyd/IF-I-L-v1.0](https://huggingface.co/DeepFloyd/IF-I-L-v1.0)
   - [DeepFloyd/IF-I-M-v1.0](https://huggingface.co/DeepFloyd/IF-I-M-v1.0)
@@ -90,7 +90,7 @@ from diffusers.utils import pt_to_pil
 import torch
 
 # stage 1
-stage_1 = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+stage_1 = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
 stage_1.enable_model_cpu_offload()
 
 # stage 2
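For context, the surrounding document chains this stage 1 output through the later stages. A condensed sketch of the cascade, assuming the stage II checkpoint named in this diff and the `encode_prompt`/`output_type="pt"` pattern used in the IF docs:

```py
# Sketch: reuse one set of prompt embeddings across the cascade.
prompt_embeds, negative_embeds = stage_1.encode_prompt("a photo of a kangaroo")

# stage 1 produces a 64x64 image tensor
image = stage_1(
    prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pt"
).images

# stage 2 upscales it to 256x256
stage_2 = DiffusionPipeline.from_pretrained(
    "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16
)
stage_2.enable_model_cpu_offload()
image = stage_2(
    image=image, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pt"
).images
```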
@@ -162,7 +162,7 @@ original_image = Image.open(BytesIO(response.content)).convert("RGB")
 original_image = original_image.resize((768, 512))
 
 # stage 1
-stage_1 = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+stage_1 = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
 stage_1.enable_model_cpu_offload()
 
 # stage 2
@@ -244,7 +244,7 @@ mask_image = Image.open(BytesIO(response.content))
 mask_image = mask_image
 
 # stage 1
-stage_1 = IFInpaintingPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+stage_1 = IFInpaintingPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
 stage_1.enable_model_cpu_offload()
 
 # stage 2
@@ -305,7 +305,7 @@ In addition to being loaded with `from_pretrained`, Pipelines can also be loaded
 ```python
 from diffusers import IFPipeline, IFSuperResolutionPipeline
 
-pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0")
+pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0")
 pipe_2 = IFSuperResolutionPipeline.from_pretrained("DeepFloyd/IF-II-L-v1.0")
 
 
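The payoff of this loading style is shown in the next hunk's context line: the same weights can back sibling pipelines without re-downloading. Reproduced here for readability as a sketch:

```py
# Reuse already-loaded components for the inpainting variants (sketch;
# mirrors the `**pipe_2.components` pattern visible in the next hunk).
from diffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline

pipe_1 = IFInpaintingPipeline(**pipe_1.components)
pipe_2 = IFInpaintingSuperResolutionPipeline(**pipe_2.components)
```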
@@ -326,7 +326,7 @@ pipe_2 = IFInpaintingSuperResolutionPipeline(**pipe_2.components)
 The simplest optimization to run IF faster is to move all model components to the GPU.
 
 ```py
-pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
 pipe.to("cuda")
 ```
 
@@ -352,7 +352,7 @@ the input image which also determines how many steps to run in the denoising pro
 A smaller number will vary the image less but run faster.
 
 ```py
-pipe = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+pipe = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
 pipe.to("cuda")
 
 image = pipe(image=image, prompt="<prompt>", strength=0.3).images
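Because `strength` both scales the noise added to the input and sets how many denoising steps actually run, sweeping it is a quick way to feel out the fidelity/speed trade-off. A sketch, assuming the `pipe` and `image` from the hunk above:

```py
# Lower strength stays closer to the input image and runs fewer steps.
for strength in (0.2, 0.5, 0.8):
    result = pipe(image=image, prompt="<prompt>", strength=strength).images[0]
    result.save(f"img2img_strength_{strength}.png")
```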
@@ -364,7 +364,7 @@ with IF and it might not give expected results.
 ```py
 import torch
 
-pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
 pipe.to("cuda")
 
 pipe.text_encoder = torch.compile(pipe.text_encoder)
@@ -378,14 +378,14 @@ When optimizing for GPU memory, we can use the standard diffusers cpu offloading
 Either the model based CPU offloading,
 
 ```py
-pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
 pipe.enable_model_cpu_offload()
 ```
 
 or the more aggressive layer based CPU offloading.
 
 ```py
-pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
 pipe.enable_sequential_cpu_offload()
 ```
 
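The two modes trade throughput for memory: model-based offloading swaps whole sub-models between CPU and GPU, while sequential offloading swaps individual layers and is slower but leaner. A rough way to compare their peak usage, as a sketch:

```py
# Measure peak GPU memory under whichever offloading mode is enabled.
import torch

torch.cuda.reset_peak_memory_stats()
image = pipe("<prompt>").images[0]
print(f"peak GPU memory: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GiB")
```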
@@ -395,13 +395,13 @@ Additionally, T5 can be loaded in 8bit precision
 from transformers import T5EncoderModel
 
 text_encoder = T5EncoderModel.from_pretrained(
-    "DeepFloyd/IF-I-IF-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit"
+    "DeepFloyd/IF-I-XL-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit"
 )
 
 from diffusers import DiffusionPipeline
 
 pipe = DiffusionPipeline.from_pretrained(
-    "DeepFloyd/IF-I-IF-v1.0",
+    "DeepFloyd/IF-I-XL-v1.0",
     text_encoder=text_encoder,  # pass the previously instantiated 8bit text encoder
     unet=None,
     device_map="auto",
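With `unet=None`, the pipeline built above can only encode text; the doc's pattern (visible in the `gc.collect()` hunk below) is to compute embeddings once and then free the 8bit encoder. A sketch of that step:

```py
# Encode once with the 8bit T5, then release it before loading the UNet.
import gc

import torch

prompt_embeds, negative_embeds = pipe.encode_prompt("<prompt>")

del pipe
gc.collect()
torch.cuda.empty_cache()
```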
@@ -422,13 +422,13 @@ from transformers import T5EncoderModel
 from diffusers.utils import pt_to_pil
 
 text_encoder = T5EncoderModel.from_pretrained(
-    "DeepFloyd/IF-I-IF-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit"
+    "DeepFloyd/IF-I-XL-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit"
 )
 
 # text to image
 
 pipe = DiffusionPipeline.from_pretrained(
-    "DeepFloyd/IF-I-IF-v1.0",
+    "DeepFloyd/IF-I-XL-v1.0",
     text_encoder=text_encoder,  # pass the previously instantiated 8bit text encoder
     unet=None,
     device_map="auto",
@@ -444,7 +444,7 @@ gc.collect()
 torch.cuda.empty_cache()
 
 pipe = IFPipeline.from_pretrained(
-    "DeepFloyd/IF-I-IF-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16, device_map="auto"
+    "DeepFloyd/IF-I-XL-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16, device_map="auto"
 )
 
 generator = torch.Generator().manual_seed(0)
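Since this pipeline was created with `text_encoder=None`, it must be fed precomputed embeddings rather than a raw prompt. A sketch of the generation call, assuming the `prompt_embeds`/`negative_embeds` computed earlier:

```py
# Generate from precomputed embeddings; no text encoder is loaded here.
image = pipe(
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_embeds,
    generator=generator,
    output_type="pt",
).images
```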
setup.py
@@ -226,7 +226,7 @@ install_requires = [
 
 setup(
     name="diffusers",
-    version="0.16.0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="0.16.1",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
     description="Diffusers",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
@@ -1,4 +1,4 @@
-__version__ = "0.16.0"
+__version__ = "0.16.1"
 
 from .configuration_utils import ConfigMixin
 from .utils import (
@@ -71,6 +71,7 @@ class AttentionBlock(nn.Module):
         self.proj_attn = nn.Linear(channels, channels, bias=True)
 
         self._use_memory_efficient_attention_xformers = False
+        self._use_2_0_attn = True
         self._attention_op = None
 
     def reshape_heads_to_batch_dim(self, tensor, merge_head_and_batch=True):
@@ -142,9 +143,8 @@ class AttentionBlock(nn.Module):
 
         scale = 1 / math.sqrt(self.channels / self.num_heads)
 
-        use_torch_2_0_attn = (
-            hasattr(F, "scaled_dot_product_attention") and not self._use_memory_efficient_attention_xformers
-        )
+        _use_2_0_attn = self._use_2_0_attn and not self._use_memory_efficient_attention_xformers
+        use_torch_2_0_attn = hasattr(F, "scaled_dot_product_attention") and _use_2_0_attn
 
         query_proj = self.reshape_heads_to_batch_dim(query_proj, merge_head_and_batch=not use_torch_2_0_attn)
         key_proj = self.reshape_heads_to_batch_dim(key_proj, merge_head_and_batch=not use_torch_2_0_attn)
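The refactor folds the new opt-out flag into the dispatch: PyTorch 2.0's fused kernel is used only when `scaled_dot_product_attention` exists, the flag is set, and xformers is off. A standalone sketch of the same selection logic under those assumptions:

```py
# Standalone sketch of the attention-backend dispatch after this change.
import torch
import torch.nn.functional as F

use_xformers = False  # mirrors self._use_memory_efficient_attention_xformers
use_2_0_attn = True   # mirrors the new self._use_2_0_attn default

_use_2_0_attn = use_2_0_attn and not use_xformers
use_torch_2_0_attn = hasattr(F, "scaled_dot_product_attention") and _use_2_0_attn

if use_torch_2_0_attn:
    q = k = v = torch.randn(2, 4, 8, 16)  # (batch, heads, seq_len, head_dim)
    out = F.scaled_dot_product_attention(q, k, v)
```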
@@ -41,7 +41,7 @@ EXAMPLE_DOC_STRING = """
         >>> from diffusers.utils import pt_to_pil
         >>> import torch
 
-        >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+        >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
         >>> pipe.enable_model_cpu_offload()
 
         >>> prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"'
@@ -70,7 +70,7 @@ EXAMPLE_DOC_STRING = """
         >>> original_image = original_image.resize((768, 512))
 
         >>> pipe = IFImg2ImgPipeline.from_pretrained(
-        ...     "DeepFloyd/IF-I-IF-v1.0",
+        ...     "DeepFloyd/IF-I-XL-v1.0",
         ...     variant="fp16",
         ...     torch_dtype=torch.float16,
         ... )
@@ -73,7 +73,7 @@ EXAMPLE_DOC_STRING = """
         >>> original_image = original_image.resize((768, 512))
 
         >>> pipe = IFImg2ImgPipeline.from_pretrained(
-        ...     "DeepFloyd/IF-I-IF-v1.0",
+        ...     "DeepFloyd/IF-I-XL-v1.0",
         ...     variant="fp16",
         ...     torch_dtype=torch.float16,
         ... )
@@ -76,7 +76,7 @@ EXAMPLE_DOC_STRING = """
         >>> mask_image = mask_image
 
         >>> pipe = IFInpaintingPipeline.from_pretrained(
-        ...     "DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16
+        ...     "DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16
         ... )
         >>> pipe.enable_model_cpu_offload()
 
@@ -78,7 +78,7 @@ EXAMPLE_DOC_STRING = """
         >>> mask_image = mask_image
 
         >>> pipe = IFInpaintingPipeline.from_pretrained(
-        ...     "DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16
+        ...     "DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16
         ... )
         >>> pipe.enable_model_cpu_offload()
 
@@ -45,7 +45,7 @@ EXAMPLE_DOC_STRING = """
         >>> from diffusers.utils import pt_to_pil
         >>> import torch
 
-        >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+        >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
         >>> pipe.enable_model_cpu_offload()
 
         >>> prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"'
@@ -267,7 +267,7 @@ def get_cached_module_file(
 
     # retrieve github version that matches
     if revision is None:
-        revision = latest_version if latest_version in available_versions else "main"
+        revision = latest_version if latest_version[1:] in available_versions else "main"
         logger.info(f"Defaulting to latest_version: {revision}.")
     elif revision in available_versions:
         revision = f"v{revision}"
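The one-character fix matters because `latest_version` carries a leading "v" (a git tag such as "v0.16.1") while `available_versions` holds bare version strings, so the old membership test could never succeed and always fell back to "main". A small sketch with hypothetical values:

```py
# Hypothetical values showing why the old comparison always missed.
latest_version = "v0.16.1"
available_versions = ["0.16.0", "0.16.1"]

old = latest_version if latest_version in available_versions else "main"      # "main"
new = latest_version if latest_version[1:] in available_versions else "main"  # "v0.16.1"
```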
@@ -94,7 +94,7 @@ class IFPipelineSlowTests(unittest.TestCase):
     def test_all(self):
        # if
 
-        pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+        pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
 
         pipe_2 = IFSuperResolutionPipeline.from_pretrained(
             "DeepFloyd/IF-II-L-v1.0", variant="fp16", torch_dtype=torch.float16, text_encoder=None, tokenizer=None