[WIP] Force unmasked area to not change

2025-12-06 20:44:33 +08:00 · 2023-06-28 15:47:05 +00:00
1 changed files with 25 additions and 13 deletions
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -650,12 +650,13 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMi
                "However, either the image or the noise timestep has not been provided."
            )

+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
        if return_image_latents or (latents is None and not is_strength_max):
            image = image.to(device=device, dtype=dtype)
            image_latents = self._encode_vae_image(image=image, generator=generator)

        if latents is None:
-            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
            # if strength is 1. then initialise the latents to noise, else initial to image + noise
            latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep)
            # if pure noise then scale the initial latents by the  Scheduler's init sigma
@@ -762,6 +763,7 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMi
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        force_unmasked_unchanged: Optional[bool] = None,
    ):
        r"""
        Function invoked when calling the pipeline for generation.
@@ -944,7 +946,16 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMi
        # 6. Prepare latent variables
        num_channels_latents = self.vae.config.latent_channels
        num_channels_unet = self.unet.config.in_channels
-        return_image_latents = num_channels_unet == 4
+
+        return_image_latents = num_channels_unet == 4 or force_unmasked_unchanged
+        if num_channels_unet == 4 and force_unmasked_unchanged is False:
+            raise ValueError("Cannot set `force_unmasked_unchanged=False` for inpainting if the UNet has only 4 input channels. Either set `force_unmasked_unchanged` to `True` or use a UNet checkpoint that has been trained specifically for inpainting.")
+        elif num_channels_unet == 4 and force_unmasked_unchanged is None:
+            # For checkpoints not trained for inpainting the unmasked area should be default not be changed
+            force_unmasked_unchanged = True
+        elif num_channels_unet > 4 and force_unmasked_unchanged is None:
+            # For checkpoints trained for inpainting the unmasked area should be default be changed to ensure more 
+            force_unmasked_unchanged = False

        latents_outputs = self.prepare_latents(
            batch_size * num_images_per_prompt,
@@ -959,10 +970,10 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMi
            timestep=latent_timestep,
            is_strength_max=is_strength_max,
            return_noise=True,
-            return_image_latents=return_image_latents,
+            return_image_latents=force_unmasked_unchanged,
        )

-        if return_image_latents:
+        if force_unmasked_unchanged:
            latents, noise, image_latents = latents_outputs
        else:
            latents, noise = latents_outputs
@@ -1033,17 +1044,18 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMi
                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

-                if num_channels_unet == 4:
-                    init_latents_proper = image_latents[:1]
-                    init_mask = mask[:1]

-                    if i < len(timesteps) - 1:
-                        noise_timestep = timesteps[i + 1]
-                        init_latents_proper = self.scheduler.add_noise(
-                            init_latents_proper, noise, torch.tensor([noise_timestep])
-                        )
+                if num_channels_unet == 4 and i < len(timesteps) - 1:
+                    # add noise for next timestep
+                    noise_timestep = timesteps[i + 1]

-                    latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+                    init_latents_proper = self.scheduler.add_noise(
+                        image_latents[:1], noise, torch.tensor([noise_timestep])
+                    )
+                    latents = (1 - mask[:1]) * init_latents_proper + mask[:1] * latents
+
+                if force_unmasked_unchanged and i == (len(timesteps) - 1):
+                    latents = (1 - mask[:1]) * image_latents[:1] + mask[:1] * latents

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):