mirror of https://github.com/huggingface/diffusers.git
synced 2026-04-21 23:21:24 +08:00

Compare commits (7 commits): `hires-upsa...` ... `improve_co...`

| SHA1 |
|---|
| 1410a1bcdc |
| a9109dbb2b |
| 6874d2b57f |
| d8012a4825 |
| 0e9416d6a3 |
| 03dfb7f0b4 |
| fe0a0ebe88 |
@@ -280,7 +280,7 @@ init_image = init_image.resize((768, 512))

 prompt = "A fantasy landscape, trending on artstation"

-images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images

 images[0].save("fantasy_landscape.png")
 ```
@@ -49,8 +49,6 @@
     title: "OpenVINO"
   - local: optimization/mps
     title: "MPS"
-  - local: optimization/habana
-    title: "Habana Gaudi"
   title: "Optimization/Special Hardware"
 - sections:
   - local: training/overview
@@ -57,7 +57,7 @@ prompt = "An astronaut riding an elephant"
 image = pipe(
     prompt=prompt,
     source_prompt=source_prompt,
-    image=init_image,
+    init_image=init_image,
     num_inference_steps=100,
     eta=0.1,
     strength=0.8,
@@ -83,7 +83,7 @@ torch.manual_seed(0)
 image = pipe(
     prompt=prompt,
     source_prompt=source_prompt,
-    image=init_image,
+    init_image=init_image,
     num_inference_steps=100,
     eta=0.1,
     strength=0.85,
@@ -149,7 +149,7 @@ init_image = init_image.resize((768, 512))

 prompt = "A fantasy landscape, trending on artstation"

-images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images

 images[0].save("fantasy_landscape.png")
 ```
@@ -76,6 +76,33 @@ Original paper can be found [here](https://arxiv.org/abs/2206.00927) and the [im

 [[autodoc]] DPMSolverMultistepScheduler

+#### Heun scheduler inspired by Karras et. al paper
+
+Algorithm 1 of [Karras et. al](https://arxiv.org/abs/2206.00364).
+Scheduler ported from @crowsonkb's https://github.com/crowsonkb/k-diffusion library:
+
+All credit for making this scheduler work goes to [Katherine Crowson](https://github.com/crowsonkb/)
+
+[[autodoc]] HeunDiscreteScheduler
+
+#### DPM Discrete Scheduler inspired by Karras et. al paper
+
+Inspired by [Karras et. al](https://arxiv.org/abs/2206.00364).
+Scheduler ported from @crowsonkb's https://github.com/crowsonkb/k-diffusion library:
+
+All credit for making this scheduler work goes to [Katherine Crowson](https://github.com/crowsonkb/)
+
+[[autodoc]] KDPM2DiscreteScheduler
+
+#### DPM Discrete Scheduler with ancestral sampling inspired by Karras et. al paper
+
+Inspired by [Karras et. al](https://arxiv.org/abs/2206.00364).
+Scheduler ported from @crowsonkb's https://github.com/crowsonkb/k-diffusion library:
+
+All credit for making this scheduler work goes to [Katherine Crowson](https://github.com/crowsonkb/)
+
+[[autodoc]] KDPM2AncestralDiscreteScheduler
+
 #### Variance exploding, stochastic sampling from Karras et. al

 Original paper can be found [here](https://arxiv.org/abs/2006.11239).
@@ -86,7 +113,6 @@ Original paper can be found [here](https://arxiv.org/abs/2006.11239).

 Original implementation can be found [here](https://arxiv.org/abs/2206.00364).
-
 [[autodoc]] LMSDiscreteScheduler

 #### Pseudo numerical methods for diffusion models (PNDM)
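The Karras-style schedulers documented above are drop-in replacements for a pipeline's default scheduler. As a minimal sketch (the model id is an assumption; any pipeline with a compatible scheduler config should work the same way):

```python
from diffusers import HeunDiscreteScheduler, StableDiffusionPipeline

# Reuse the pipeline's existing scheduler config so only the sampling
# algorithm changes, not the noise schedule the model was trained with.
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.scheduler = HeunDiscreteScheduler.from_config(pipe.scheduler.config)
image = pipe("A fantasy landscape, trending on artstation").images[0]
```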
@@ -1,70 +0,0 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# How to use Stable Diffusion on Habana Gaudi
|
||||
|
||||
🤗 Diffusers is compatible with Habana Gaudi through 🤗 [Optimum Habana](https://huggingface.co/docs/optimum/habana/usage_guides/stable_diffusion).
|
||||
|
||||
## Requirements
|
||||
|
||||
- Optimum Habana 1.3 or later, [here](https://huggingface.co/docs/optimum/habana/installation) is how to install it.
|
||||
- SynapseAI 1.7.
|
||||
|
||||
|
||||
## Inference Pipeline
|
||||
|
||||
To generate images with Stable Diffusion 1 and 2 on Gaudi, you need to instantiate two instances:
|
||||
- A pipeline with [`GaudiStableDiffusionPipeline`](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline). This pipeline supports *text-to-image generation*.
|
||||
- A scheduler with [`GaudiDDIMScheduler`](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline#optimum.habana.diffusers.GaudiDDIMScheduler). This scheduler has been optimized for Habana Gaudi.
|
||||
|
||||
When initializing the pipeline, you have to specify `use_habana=True` to deploy it on HPUs.
|
||||
Furthermore, in order to get the fastest possible generations you should enable **HPU graphs** with `use_hpu_graphs=True`.
|
||||
Finally, you will need to specify a [Gaudi configuration](https://huggingface.co/docs/optimum/habana/package_reference/gaudi_config) which can be downloaded from the [Hugging Face Hub](https://huggingface.co/Habana).
|
||||
|
||||
```python
|
||||
from optimum.habana import GaudiConfig
|
||||
from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionPipeline
|
||||
|
||||
model_name = "stabilityai/stable-diffusion-2-base"
|
||||
scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler")
|
||||
pipeline = GaudiStableDiffusionPipeline.from_pretrained(
|
||||
model_name,
|
||||
scheduler=scheduler,
|
||||
use_habana=True,
|
||||
use_hpu_graphs=True,
|
||||
gaudi_config="Habana/stable-diffusion",
|
||||
)
|
||||
```
|
||||
|
||||
You can then call the pipeline to generate images by batches from one or several prompts:
|
||||
```python
|
||||
outputs = pipeline(
|
||||
prompt=[
|
||||
"High quality photo of an astronaut riding a horse in space",
|
||||
"Face of a yellow cat, high resolution, sitting on a park bench",
|
||||
],
|
||||
num_images_per_prompt=10,
|
||||
batch_size=4,
|
||||
)
|
||||
```
|
||||
|
||||
For more information, check out Optimum Habana's [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/stable_diffusion) and the [example](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion) provided in the official Github repository.
|
||||
|
||||
|
||||
## Benchmark
|
||||
|
||||
Here are the latencies for Habana Gaudi 1 and Gaudi 2 with the [Habana/stable-diffusion](https://huggingface.co/Habana/stable-diffusion) Gaudi configuration (mixed precision bf16/fp32):
|
||||
|
||||
| | Latency | Batch size |
|
||||
| ------- |:-------:|:----------:|
|
||||
| Gaudi 1 | 4.37s | 4/8 |
|
||||
| Gaudi 2 | 1.19s | 4/8 |
|
||||
@@ -177,7 +177,7 @@ init_image = download_image(

 prompt = "A fantasy landscape, trending on artstation"

-images = pipe.img2img(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe.img2img(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images

 ### Inpainting

@@ -187,7 +187,7 @@ init_image = download_image(img_url).resize((512, 512))
 mask_image = download_image(mask_url).resize((512, 512))

 prompt = "a cat sitting on a bench"
-images = pipe.inpaint(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images
+images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
 ```

 As shown above this one pipeline can run all both "text-to-image", "image-to-image", and "inpainting" in one pipeline.
@@ -37,7 +37,7 @@ init_image.thumbnail((768, 768))

 prompt = "A fantasy landscape, trending on artstation"

-images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images

 images[0].save("fantasy_landscape.png")
 ```
@@ -166,7 +166,7 @@ init_image = download_image("https://raw.githubusercontent.com/CompVis/stable-di

 prompt = "A fantasy landscape, trending on artstation"

-images = pipe.img2img(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe.img2img(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images

 ### Inpainting

@@ -176,7 +176,7 @@ init_image = download_image(img_url).resize((512, 512))
 mask_image = download_image(mask_url).resize((512, 512))

 prompt = "a cat sitting on a bench"
-images = pipe.inpaint(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images
+images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
 ```

 As shown above this one pipeline can run all both "text-to-image", "image-to-image", and "inpainting" in one pipeline.
@@ -420,7 +420,7 @@ init_image = Image.open(BytesIO(response.content)).convert("RGB")
 init_image = init_image.resize((512, 512))
 res = pipe.train(
     prompt,
-    image=init_image,
+    init_image,
     guidance_scale=7.5,
     num_inference_steps=50,
     generator=generator)
@@ -17,7 +17,7 @@ from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from diffusers.utils import deprecate, logging
+from diffusers.utils import logging

 # TODO: remove and import from diffusers.utils when the new version of diffusers is released
 from packaging import version
@@ -133,7 +133,7 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
     def train(
         self,
         prompt: Union[str, List[str]],
-        image: Union[torch.FloatTensor, PIL.Image.Image],
+        init_image: Union[torch.FloatTensor, PIL.Image.Image],
         height: Optional[int] = 512,
         width: Optional[int] = 512,
         generator: Optional[torch.Generator] = None,
@@ -184,10 +184,6 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
-        message = "Please use `image` instead of `init_image`."
-        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
-        image = init_image or image
-
        accelerator = Accelerator(
            gradient_accumulation_steps=1,
            mixed_precision="fp16",
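The `deprecate(...)` shim removed above is what kept the legacy `init_image` keyword working after the rename to `image`. Roughly, it pops the old kwarg, warns about the removal version, and returns the value so the caller can fall back to it. A simplified sketch of the pattern (a hypothetical helper, not the actual `diffusers.utils.deprecate` implementation):

```python
import warnings

def deprecate_kwarg(name, removal_version, message, take_from):
    # Hypothetical stand-in: pop the legacy kwarg if present, warn, return it.
    value = take_from.pop(name, None)
    if value is not None:
        warnings.warn(
            f"`{name}` is deprecated and will be removed in {removal_version}. {message}",
            FutureWarning,
        )
    return value

def train(prompt, image=None, **kwargs):
    message = "Please use `image` instead of `init_image`."
    init_image = deprecate_kwarg("init_image", "0.12.0", message, take_from=kwargs)
    image = init_image or image  # the legacy kwarg wins, mirroring the removed shim
```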
@@ -245,14 +241,14 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
             lr=embedding_learning_rate,
         )

-        if isinstance(image, PIL.Image.Image):
-            image = preprocess(image)
+        if isinstance(init_image, PIL.Image.Image):
+            init_image = preprocess(init_image)

         latents_dtype = text_embeddings.dtype
-        image = image.to(device=self.device, dtype=latents_dtype)
-        init_latent_image_dist = self.vae.encode(image).latent_dist
-        image_latents = init_latent_image_dist.sample(generator=generator)
-        image_latents = 0.18215 * image_latents
+        init_image = init_image.to(device=self.device, dtype=latents_dtype)
+        init_latent_image_dist = self.vae.encode(init_image).latent_dist
+        init_image_latents = init_latent_image_dist.sample(generator=generator)
+        init_image_latents = 0.18215 * init_image_latents

         progress_bar = tqdm(range(text_embedding_optimization_steps), disable=not accelerator.is_local_main_process)
         progress_bar.set_description("Steps")
@@ -263,12 +259,12 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
         for _ in range(text_embedding_optimization_steps):
             with accelerator.accumulate(text_embeddings):
                 # Sample noise that we'll add to the latents
-                noise = torch.randn(image_latents.shape).to(image_latents.device)
-                timesteps = torch.randint(1000, (1,), device=image_latents.device)
+                noise = torch.randn(init_image_latents.shape).to(init_image_latents.device)
+                timesteps = torch.randint(1000, (1,), device=init_image_latents.device)

                 # Add noise to the latents according to the noise magnitude at each timestep
                 # (this is the forward diffusion process)
-                noisy_latents = self.scheduler.add_noise(image_latents, noise, timesteps)
+                noisy_latents = self.scheduler.add_noise(init_image_latents, noise, timesteps)

                 # Predict the noise residual
                 noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample
@@ -305,12 +301,12 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
         for _ in range(model_fine_tuning_optimization_steps):
             with accelerator.accumulate(self.unet.parameters()):
                 # Sample noise that we'll add to the latents
-                noise = torch.randn(image_latents.shape).to(image_latents.device)
-                timesteps = torch.randint(1000, (1,), device=image_latents.device)
+                noise = torch.randn(init_image_latents.shape).to(init_image_latents.device)
+                timesteps = torch.randint(1000, (1,), device=init_image_latents.device)

                 # Add noise to the latents according to the noise magnitude at each timestep
                 # (this is the forward diffusion process)
-                noisy_latents = self.scheduler.add_noise(image_latents, noise, timesteps)
+                noisy_latents = self.scheduler.add_noise(init_image_latents, noise, timesteps)

                 # Predict the noise residual
                 noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample
@@ -488,6 +488,24 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             feature_extractor=feature_extractor,
         )

+    def enable_xformers_memory_efficient_attention(self):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(True)
+
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(False)
+
     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
         r"""
         Enable sliced attention computation.
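For context, the toggles added above are meant to be called on an instantiated pipeline. A usage sketch (assumes the `xformers` package and a CUDA device are available, and that the community pipeline id below matches the file this hunk edits):

```python
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    custom_pipeline="lpw_stable_diffusion",  # the community pipeline touched here
    torch_dtype=torch.float16,
).to("cuda")

pipe.enable_xformers_memory_efficient_attention()   # lower memory, often faster
image = pipe(prompt="a photo of an astronaut riding a horse").images[0]
pipe.disable_xformers_memory_efficient_attention()  # back to default attention
```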
@@ -537,7 +555,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         self,
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+        init_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
         mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
         height: int = 512,
         width: int = 512,
@@ -565,11 +583,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             negative_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                 if `guidance_scale` is less than `1`).
-            image (`torch.FloatTensor` or `PIL.Image.Image`):
+            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process.
             mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
                 replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                 PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                 contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -587,11 +605,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                 usually at the expense of lower image quality.
             strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
-                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                 number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                 noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             eta (`float`, *optional*, defaults to 0.0):
@@ -630,9 +648,6 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
-        message = "Please use `image` instead of `init_image`."
-        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
-        image = init_image or image

         if isinstance(prompt, str):
             batch_size = 1
@@ -699,7 +714,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         mask = None
         noise = None

-        if image is None:
+        if init_image is None:
             # get the initial random noise unless the user supplied it

             # Unlike in other pipelines, latents need to be generated in the target device
@@ -738,11 +753,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             # scale the initial noise by the standard deviation required by the scheduler
             latents = latents * self.scheduler.init_noise_sigma
         else:
-            if isinstance(image, PIL.Image.Image):
-                image = preprocess_image(image)
+            if isinstance(init_image, PIL.Image.Image):
+                init_image = preprocess_image(init_image)
             # encode the init image into latents and scale the latents
-            image = image.to(device=self.device, dtype=latents_dtype)
-            init_latent_dist = self.vae.encode(image).latent_dist
+            init_image = init_image.to(device=self.device, dtype=latents_dtype)
+            init_latent_dist = self.vae.encode(init_image).latent_dist
             init_latents = init_latent_dist.sample(generator=generator)
             init_latents = 0.18215 * init_latents
             init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0)
@@ -757,7 +772,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):

             # check sizes
             if not mask.shape == init_latents.shape:
-                raise ValueError("The mask and image should be the same size!")
+                raise ValueError("The mask and init_image should be the same size!")

             # get the original timestep using init_timestep
             offset = self.scheduler.config.get("steps_offset", 0)
@@ -946,7 +961,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):

     def img2img(
         self,
-        image: Union[torch.FloatTensor, PIL.Image.Image],
+        init_image: Union[torch.FloatTensor, PIL.Image.Image],
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
         strength: float = 0.8,
@@ -965,7 +980,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         r"""
         Function for image-to-image generation.
         Args:
-            image (`torch.FloatTensor` or `PIL.Image.Image`):
+            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process.
             prompt (`str` or `List[str]`):
@@ -974,11 +989,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                 if `guidance_scale` is less than `1`).
             strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
-                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                 number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                 noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference. This parameter will be modulated by `strength`.
@@ -1020,7 +1035,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         return self.__call__(
             prompt=prompt,
             negative_prompt=negative_prompt,
-            image=image,
+            init_image=init_image,
             num_inference_steps=num_inference_steps,
             guidance_scale=guidance_scale,
             strength=strength,
@@ -1037,7 +1052,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):

     def inpaint(
         self,
-        image: Union[torch.FloatTensor, PIL.Image.Image],
+        init_image: Union[torch.FloatTensor, PIL.Image.Image],
         mask_image: Union[torch.FloatTensor, PIL.Image.Image],
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -1057,11 +1072,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         r"""
         Function for inpaint.
         Args:
-            image (`torch.FloatTensor` or `PIL.Image.Image`):
+            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process. This is the image whose masked region will be inpainted.
             mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
                 replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                 PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                 contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -1073,7 +1088,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             strength (`float`, *optional*, defaults to 0.8):
                 Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
                 is 1, the denoising process will be run on the masked area for the full number of iterations specified
-                in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
+                in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
                 noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
@@ -1116,7 +1131,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         return self.__call__(
             prompt=prompt,
             negative_prompt=negative_prompt,
-            image=image,
+            init_image=init_image,
             mask_image=mask_image,
             num_inference_steps=num_inference_steps,
             guidance_scale=guidance_scale,
@@ -10,7 +10,7 @@ from diffusers.onnx_utils import OnnxRuntimeModel
 from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from diffusers.utils import deprecate, logging
+from diffusers.utils import logging

 # TODO: remove and import from diffusers.utils when the new version of diffusers is released
 from packaging import version
@@ -441,7 +441,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         self,
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        image: Union[np.ndarray, PIL.Image.Image] = None,
+        init_image: Union[np.ndarray, PIL.Image.Image] = None,
         mask_image: Union[np.ndarray, PIL.Image.Image] = None,
         height: int = 512,
         width: int = 512,
@@ -469,11 +469,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             negative_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                 if `guidance_scale` is less than `1`).
-            image (`np.ndarray` or `PIL.Image.Image`):
+            init_image (`np.ndarray` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process.
             mask_image (`np.ndarray` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
                 replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                 PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                 contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -491,11 +491,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                 usually at the expense of lower image quality.
             strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
-                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                 number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                 noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             eta (`float`, *optional*, defaults to 0.0):
@@ -533,9 +533,6 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
-        message = "Please use `image` instead of `init_image`."
-        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
-        image = init_image or image

         if isinstance(prompt, str):
             batch_size = 1
@@ -601,7 +598,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         mask = None
         noise = None

-        if image is None:
+        if init_image is None:
             latents_shape = (
                 batch_size * num_images_per_prompt,
                 4,
@@ -619,11 +616,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             # scale the initial noise by the standard deviation required by the scheduler
             latents = latents * self.scheduler.init_noise_sigma
         else:
-            if isinstance(image, PIL.Image.Image):
-                image = preprocess_image(image)
+            if isinstance(init_image, PIL.Image.Image):
+                init_image = preprocess_image(init_image)
             # encode the init image into latents and scale the latents
-            image = image.astype(latents_dtype)
-            init_latents = self.vae_encoder(sample=image)[0]
+            init_image = init_image.astype(latents_dtype)
+            init_latents = self.vae_encoder(sample=init_image)[0]
             init_latents = 0.18215 * init_latents
             init_latents = np.concatenate([init_latents] * batch_size * num_images_per_prompt)
             init_latents_orig = init_latents
@@ -638,7 +635,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             # check sizes
             if not mask.shape == init_latents.shape:
                 print(mask.shape, init_latents.shape)
-                raise ValueError("The mask and image should be the same size!")
+                raise ValueError("The mask and init_image should be the same size!")

             # get the original timestep using init_timestep
             offset = self.scheduler.config.get("steps_offset", 0)
@@ -831,7 +828,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):

     def img2img(
         self,
-        image: Union[np.ndarray, PIL.Image.Image],
+        init_image: Union[np.ndarray, PIL.Image.Image],
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
         strength: float = 0.8,
@@ -850,7 +847,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         r"""
         Function for image-to-image generation.
         Args:
-            image (`np.ndarray` or `PIL.Image.Image`):
+            init_image (`np.ndarray` or `PIL.Image.Image`):
                 `Image`, or ndarray representing an image batch, that will be used as the starting point for the
                 process.
             prompt (`str` or `List[str]`):
@@ -859,11 +856,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                 if `guidance_scale` is less than `1`).
             strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
-                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                 number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                 noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference. This parameter will be modulated by `strength`.
@@ -904,7 +901,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         return self.__call__(
             prompt=prompt,
             negative_prompt=negative_prompt,
-            image=image,
+            init_image=init_image,
             num_inference_steps=num_inference_steps,
             guidance_scale=guidance_scale,
             strength=strength,
@@ -921,7 +918,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):

     def inpaint(
         self,
-        image: Union[np.ndarray, PIL.Image.Image],
+        init_image: Union[np.ndarray, PIL.Image.Image],
         mask_image: Union[np.ndarray, PIL.Image.Image],
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -941,11 +938,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         r"""
         Function for inpaint.
         Args:
-            image (`np.ndarray` or `PIL.Image.Image`):
+            init_image (`np.ndarray` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process. This is the image whose masked region will be inpainted.
             mask_image (`np.ndarray` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
                 replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                 PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                 contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -957,7 +954,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             strength (`float`, *optional*, defaults to 0.8):
                 Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
                 is 1, the denoising process will be run on the masked area for the full number of iterations specified
-                in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
+                in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
                 noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
@@ -999,7 +996,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
         return self.__call__(
             prompt=prompt,
             negative_prompt=negative_prompt,
-            image=image,
+            init_image=init_image,
             mask_image=mask_image,
             num_inference_steps=num_inference_steps,
             guidance_scale=guidance_scale,
@@ -106,6 +106,24 @@ class StableDiffusionPipeline(DiffusionPipeline):
         sampling = getattr(library, "sampling")
         self.sampler = getattr(sampling, scheduler_type)

+    def enable_xformers_memory_efficient_attention(self):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(True)
+
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(False)
+
     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
         r"""
         Enable sliced attention computation.
@@ -121,7 +121,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
     def inpaint(
         self,
         prompt: Union[str, List[str]],
-        image: Union[torch.FloatTensor, PIL.Image.Image],
+        init_image: Union[torch.FloatTensor, PIL.Image.Image],
         mask_image: Union[torch.FloatTensor, PIL.Image.Image],
         strength: float = 0.8,
         num_inference_steps: Optional[int] = 50,
@@ -138,7 +138,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
         # For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionImg2ImgPipeline
         return StableDiffusionInpaintPipelineLegacy(**self.components)(
             prompt=prompt,
-            image=image,
+            init_image=init_image,
             mask_image=mask_image,
             strength=strength,
             num_inference_steps=num_inference_steps,
@@ -156,7 +156,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
     def img2img(
         self,
         prompt: Union[str, List[str]],
-        image: Union[torch.FloatTensor, PIL.Image.Image],
+        init_image: Union[torch.FloatTensor, PIL.Image.Image],
         strength: float = 0.8,
         num_inference_steps: Optional[int] = 50,
         guidance_scale: Optional[float] = 7.5,
@@ -173,7 +173,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
         # For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionImg2ImgPipeline
         return StableDiffusionImg2ImgPipeline(**self.components)(
             prompt=prompt,
-            image=image,
+            init_image=init_image,
             strength=strength,
             num_inference_steps=num_inference_steps,
             guidance_scale=guidance_scale,
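Note how each mega-pipeline method rebuilds a task-specific pipeline from `**self.components`, so the underlying models are shared rather than reloaded. The same trick works outside the class; a sketch (model id assumed):

```python
from diffusers import StableDiffusionImg2ImgPipeline, StableDiffusionPipeline

# Load the weights once, then reuse the exact same components for a second
# task; no additional memory is consumed for the shared models.
text2img = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
img2img = StableDiffusionImg2ImgPipeline(**text2img.components)
```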
@@ -183,6 +183,24 @@ class TextInpainting(DiffusionPipeline):
                 return torch.device(module._hf_hook.execution_device)
         return self.device

+    def enable_xformers_memory_efficient_attention(self):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(True)
+
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(False)
+
     @torch.no_grad()
     def __call__(
         self,
@@ -19,13 +19,6 @@ And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) e
 accelerate config
 ```

-Or if your environment doesn't support an interactive shell e.g. a notebook
-
-```python
-from accelerate.utils import write_basic_config
-write_basic_config()
-```
-
 ### Dog toy example

 You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree.
@@ -70,7 +63,7 @@ accelerate launch train_dreambooth.py \
 ### Training with prior-preservation loss

 Prior-preservation is used to avoid overfitting and language-drift. Refer to the paper to learn more about it. For prior-preservation we first generate images using the model with a class prompt and then use those during training along with our data.
-According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior-preservation. 200-300 works well for most cases. The `num_class_images` flag sets the number of images to generate with the class prompt. You can place existing images in `class_data_dir`, and the training script will generate any additional images so that `num_class_images` are present in `class_data_dir` during training time.
+According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior-preservation. 200-300 works well for most cases.

 ```bash
 export MODEL_NAME="CompVis/stable-diffusion-v1-4"
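Conceptually, the class images used for prior preservation are just ordinary samples from the base model for the class prompt; the snippet below sketches what the training script does internally when `class_data_dir` holds fewer than `num_class_images` images (hypothetical output path, not the script's actual code):

```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
).to("cuda")

class_prompt = "a photo of a dog"  # the generic class prompt, not the instance prompt
for i in range(200):  # 200-300 class images works well for most cases
    image = pipe(class_prompt).images[0]
    image.save(f"class_images/dog_{i:04d}.png")  # hypothetical class_data_dir
```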
@@ -107,8 +107,8 @@ def parse_args(input_args=None):
         type=int,
         default=100,
         help=(
-            "Minimal class images for prior preservation loss. If there are not enough images already present in"
-            " class_data_dir, additional images will be sampled with class_prompt."
+            "Minimal class images for prior preservation loss. If not have enough images, additional images will be"
+            " sampled with class_prompt."
         ),
     )
     parser.add_argument(
@@ -89,8 +89,8 @@ def parse_args():
         type=int,
         default=100,
         help=(
-            "Minimal class images for prior preservation loss. If there are not enough images already present in"
-            " class_data_dir, additional images will be sampled with class_prompt."
+            "Minimal class images for prior preservation loss. If not have enough images, additional images will be"
+            " sampled with class_prompt."
         ),
     )
     parser.add_argument(
@@ -14,6 +14,7 @@ from datasets import load_dataset
 from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel, __version__
 from diffusers.optimization import get_scheduler
 from diffusers.training_utils import EMAModel
+from diffusers.utils import deprecate
 from huggingface_hub import HfFolder, Repository, whoami
 from packaging import version
 from torchvision.transforms import (
@@ -416,7 +417,11 @@ def main(args):
         scheduler=noise_scheduler,
     )

-    generator = torch.Generator(device=pipeline.device).manual_seed(0)
+    deprecate("todo: remove this check", "0.10.0", "when the most used version is >= 0.8.0")
+    if diffusers_version < version.parse("0.8.0"):
+        generator = torch.manual_seed(0)
+    else:
+        generator = torch.Generator(device=pipeline.device).manual_seed(0)
     # run pipeline in inference (sample random noise and denoise)
     images = pipeline(
         generator=generator,
@@ -1,8 +1,6 @@
 import argparse
 import math
 import os
-from pathlib import Path
-from typing import Optional

 import torch
 import torch.nn.functional as F
@@ -11,9 +9,9 @@ from accelerate import Accelerator
 from accelerate.logging import get_logger
 from datasets import load_dataset
 from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel
+from diffusers.hub_utils import init_git_repo, push_to_hub
 from diffusers.optimization import get_scheduler
 from diffusers.training_utils import EMAModel
-from huggingface_hub import HfFolder, Repository, whoami
 from onnxruntime.training.ortmodule import ORTModule
 from torchvision.transforms import (
     CenterCrop,
@@ -30,16 +28,6 @@ from tqdm.auto import tqdm
 logger = get_logger(__name__)


-def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
-    if token is None:
-        token = HfFolder.get_token()
-    if organization is None:
-        username = whoami(token)["name"]
-        return f"{username}/{model_id}"
-    else:
-        return f"{organization}/{model_id}"
-
-
 def main(args):
     logging_dir = os.path.join(args.output_dir, args.logging_dir)
     accelerator = Accelerator(
@@ -125,22 +113,8 @@ def main(args):

     ema_model = EMAModel(model, inv_gamma=args.ema_inv_gamma, power=args.ema_power, max_value=args.ema_max_decay)

-    # Handle the repository creation
-    if accelerator.is_main_process:
-        if args.push_to_hub:
-            if args.hub_model_id is None:
-                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
-            else:
-                repo_name = args.hub_model_id
-            repo = Repository(args.output_dir, clone_from=repo_name)
-
-            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
-                if "step_*" not in gitignore:
-                    gitignore.write("step_*\n")
-                if "epoch_*" not in gitignore:
-                    gitignore.write("epoch_*\n")
-        elif args.output_dir is not None:
-            os.makedirs(args.output_dir, exist_ok=True)
+    if args.push_to_hub:
+        repo = init_git_repo(args, at_init=True)

     if accelerator.is_main_process:
         run = os.path.split(__file__)[-1].split(".")[0]
@@ -212,9 +186,10 @@ def main(args):

             if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
                 # save the model
-                pipeline.save_pretrained(args.output_dir)
                 if args.push_to_hub:
-                    repo.push_to_hub(commit_message=f"Epoch {epoch}", blocking=False)
+                    push_to_hub(args, pipeline, repo, commit_message=f"Epoch {epoch}", blocking=False)
+                else:
+                    pipeline.save_pretrained(args.output_dir)
         accelerator.wait_for_everyone()

     accelerator.end_training()
@@ -33,7 +33,6 @@ from diffusers import (
     DPMSolverMultistepScheduler,
     EulerAncestralDiscreteScheduler,
     EulerDiscreteScheduler,
-    HeunDiscreteScheduler,
     LDMTextToImagePipeline,
     LMSDiscreteScheduler,
     PNDMScheduler,
@@ -208,12 +207,12 @@ def conv_attn_to_linear(checkpoint):
         checkpoint[key] = checkpoint[key][:, :, 0]


-def create_unet_diffusers_config(original_config, image_size: int):
+def create_unet_diffusers_config(original_config):
     """
     Creates a config for the diffusers based on the config of the LDM model.
     """
+    model_params = original_config.model.params
     unet_params = original_config.model.params.unet_config.params
-    vae_params = original_config.model.params.first_stage_config.params.ddconfig

     block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
@@ -231,19 +230,8 @@ def create_unet_diffusers_config(original_config, image_size: int):
         up_block_types.append(block_type)
         resolution //= 2

-    vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1)
-
-    head_dim = unet_params.num_heads if "num_heads" in unet_params else None
-    use_linear_projection = (
-        unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False
-    )
-    if use_linear_projection:
-        # stable diffusion 2-base-512 and 2-768
-        if head_dim is None:
-            head_dim = [5, 10, 20, 20]
-
     config = dict(
-        sample_size=image_size // vae_scale_factor,
+        sample_size=model_params.image_size,
         in_channels=unet_params.in_channels,
         out_channels=unet_params.out_channels,
         down_block_types=tuple(down_block_types),
@@ -251,14 +239,13 @@ def create_unet_diffusers_config(original_config, image_size: int):
         block_out_channels=tuple(block_out_channels),
         layers_per_block=unet_params.num_res_blocks,
         cross_attention_dim=unet_params.context_dim,
-        attention_head_dim=head_dim,
-        use_linear_projection=use_linear_projection,
+        attention_head_dim=unet_params.num_heads,
     )

     return config


-def create_vae_diffusers_config(original_config, image_size: int):
+def create_vae_diffusers_config(original_config):
     """
     Creates a config for the diffusers based on the config of the LDM model.
     """
@@ -270,7 +257,7 @@ def create_vae_diffusers_config(original_config, image_size: int):
     up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)

     config = dict(
-        sample_size=image_size,
+        sample_size=vae_params.resolution,
         in_channels=vae_params.in_channels,
         out_channels=vae_params.out_ch,
         down_block_types=tuple(down_block_types),
@@ -647,22 +634,6 @@ def convert_ldm_clip_checkpoint(checkpoint):
     return text_model


-def convert_open_clip_checkpoint(checkpoint):
-    text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder")
-
-    # SKIP for now - need openclip -> HF conversion script here
-    # keys = list(checkpoint.keys())
-    #
-    # text_model_dict = {}
-    # for key in keys:
-    #     if key.startswith("cond_stage_model.model.transformer"):
-    #         text_model_dict[key[len("cond_stage_model.model.transformer.") :]] = checkpoint[key]
-    #
-    # text_model.load_state_dict(text_model_dict)
-
-    return text_model
-
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -682,24 +653,6 @@ if __name__ == "__main__":
         type=str,
         help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancest', 'dpm']",
     )
-    parser.add_argument(
-        "--image_size",
-        default=None,
-        type=int,
-        help=(
-            "The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Siffusion v2"
-            " Base. Use 768 for Stable Diffusion v2."
-        ),
-    )
-    parser.add_argument(
-        "--prediction_type",
-        default=None,
-        type=int,
-        help=(
-            "The prediction type that the model was trained on. Use 'epsilon' for Stable Diffusion v1.X and Stable"
-            " Siffusion v2 Base. Use 'v-prediction' for Stable Diffusion v2."
-        ),
-    )
     parser.add_argument(
         "--extract_ema",
         action="store_true",
@@ -710,15 +663,13 @@ if __name__ == "__main__":
         ),
     )
     parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")

     args = parser.parse_args()

-    image_size = args.image_size
-    prediction_type = args.prediction_type
-
     checkpoint = torch.load(args.checkpoint_path)
     global_step = checkpoint["global_step"]
     checkpoint = checkpoint["state_dict"]

+    prediction_type = "epsilon"
     if args.original_config_file is None:
         key_name = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight"
@@ -728,8 +679,9 @@ if __name__ == "__main__":
                 "wget https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml"
             )
             args.original_config_file = "./v2-inference-v.yaml"
+            prediction_type
         else:
-            # model_type = "v1"
+            # model_type = "v2"
             os.system(
                 "wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"
             )
@@ -737,69 +689,51 @@ if __name__ == "__main__":
|
||||
|
||||
original_config = OmegaConf.load(args.original_config_file)
|
||||
|
||||
if (
|
||||
"parameterization" in original_config["model"]["params"]
|
||||
and original_config["model"]["params"]["parameterization"] == "v"
|
||||
):
|
||||
if prediction_type is None:
|
||||
# NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"`
|
||||
# as it relies on a brittle global step parameter here
|
||||
prediction_type = "epsilon" if global_step == 875000 else "v_prediction"
|
||||
if image_size is None:
|
||||
# NOTE: For stable diffusion 2 base one has to pass `image_size==512`
|
||||
# as it relies on a brittle global step parameter here
|
||||
image_size = 512 if global_step == 875000 else 768
|
||||
else:
|
||||
if prediction_type is None:
|
||||
prediction_type = "epsilon"
|
||||
if image_size is None:
|
||||
image_size = 512
|
||||
|
||||
num_train_timesteps = original_config.model.params.timesteps
|
||||
beta_start = original_config.model.params.linear_start
|
||||
beta_end = original_config.model.params.linear_end
|
||||
|
||||
scheduler = DDIMScheduler(
|
||||
beta_end=beta_end,
|
||||
beta_schedule="scaled_linear",
|
||||
beta_start=beta_start,
|
||||
num_train_timesteps=num_train_timesteps,
|
||||
steps_offset=1,
|
||||
clip_sample=False,
|
||||
set_alpha_to_one=False,
|
||||
prediction_type=prediction_type,
|
||||
)
|
||||
if args.scheduler_type == "pndm":
|
||||
config = dict(scheduler.config)
|
||||
config["skip_prk_steps"] = True
|
||||
scheduler = PNDMScheduler.from_config(config)
|
||||
scheduler = PNDMScheduler(
|
||||
beta_end=beta_end,
|
||||
beta_schedule="scaled_linear",
|
||||
beta_start=beta_start,
|
||||
num_train_timesteps=num_train_timesteps,
|
||||
skip_prk_steps=True,
|
||||
)
|
||||
elif args.scheduler_type == "lms":
|
||||
scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
|
||||
elif args.scheduler_type == "heun":
|
||||
scheduler = HeunDiscreteScheduler.from_config(scheduler.config)
|
||||
scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
|
||||
elif args.scheduler_type == "euler":
|
||||
scheduler = EulerDiscreteScheduler.from_config(scheduler.config)
|
||||
scheduler = EulerDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
|
||||
elif args.scheduler_type == "euler-ancestral":
|
||||
scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config)
|
||||
scheduler = EulerAncestralDiscreteScheduler(
|
||||
beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear"
|
||||
)
|
||||
elif args.scheduler_type == "dpm":
|
||||
scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
|
||||
scheduler = DPMSolverMultistepScheduler(
|
||||
beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear"
|
||||
)
|
||||
elif args.scheduler_type == "ddim":
|
||||
scheduler = scheduler
|
||||
scheduler = DDIMScheduler(
|
||||
beta_start=beta_start,
|
||||
beta_end=beta_end,
|
||||
beta_schedule="scaled_linear",
|
||||
clip_sample=False,
|
||||
set_alpha_to_one=False,
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!")
|
||||
|
    # Convert the UNet2DConditionModel model.
    unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
    unet = UNet2DConditionModel(**unet_config)

    unet_config = create_unet_diffusers_config(original_config)
    converted_unet_checkpoint = convert_ldm_unet_checkpoint(
        checkpoint, unet_config, path=args.checkpoint_path, extract_ema=args.extract_ema
    )

    unet = UNet2DConditionModel(**unet_config)
    unet.load_state_dict(converted_unet_checkpoint)

    # Convert the VAE model.
    vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
    vae_config = create_vae_diffusers_config(original_config)
    converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)

    vae = AutoencoderKL(**vae_config)
@@ -807,20 +741,7 @@ if __name__ == "__main__":

    # Convert the text model.
    text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1]
    if text_model_type == "FrozenOpenCLIPEmbedder":
        text_model = convert_open_clip_checkpoint(checkpoint)
        tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer")
        pipe = StableDiffusionPipeline(
            vae=vae,
            text_encoder=text_model,
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler,
            safety_checker=None,
            feature_extractor=None,
            requires_safety_checker=False,
        )
    elif text_model_type == "FrozenCLIPEmbedder":
    if text_model_type == "FrozenCLIPEmbedder":
        text_model = convert_ldm_clip_checkpoint(checkpoint)
        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
        safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker")
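For the `FrozenCLIPEmbedder` branch, the script goes on to assemble a v1 pipeline from the converted parts. A hedged sketch of that final step; `feature_extractor` and `args.dump_path` are assumptions, not shown in the hunk above:

```python
# Sketch only: assumes the objects built above are in scope.
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
pipe = StableDiffusionPipeline(
    vae=vae,
    text_encoder=text_model,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=scheduler,
    safety_checker=safety_checker,
    feature_extractor=feature_extractor,
)
pipe.save_pretrained(args.dump_path)  # assumed CLI argument
```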
70
scripts/v1-inference.yaml
Normal file
@@ -0,0 +1,70 @@
model:
  base_learning_rate: 1.0e-04
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false # Note: different from the one we trained before
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False

    scheduler_config: # 10000 warmup steps
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps: [ 10000 ]
        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
        f_start: [ 1.e-6 ]
        f_max: [ 1. ]
        f_min: [ 1. ]

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
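Since the YAML now ships with the repo, the conversion script can read it locally instead of fetching it. A quick sketch with OmegaConf, mirroring how the script consumes the file:

```python
from omegaconf import OmegaConf

config = OmegaConf.load("scripts/v1-inference.yaml")
print(config.model.params.timesteps)     # 1000
print(config.model.params.linear_start)  # 0.00085
```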
2
setup.py
@@ -214,7 +214,7 @@ install_requires = [

setup(
    name="diffusers",
    version="0.10.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
    version="0.9.0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
    description="Diffusers",
    long_description=open("README.md", "r", encoding="utf-8").read(),
    long_description_content_type="text/markdown",

@@ -9,7 +9,7 @@ from .utils import (
)


__version__ = "0.10.0.dev0"
__version__ = "0.9.0"

from .configuration_utils import ConfigMixin
from .onnx_utils import OnnxRuntimeModel

@@ -49,6 +49,8 @@ if is_torch_available():
        HeunDiscreteScheduler,
        IPNDMScheduler,
        KarrasVeScheduler,
        KDPM2AncestralDiscreteScheduler,
        KDPM2DiscreteScheduler,
        PNDMScheduler,
        RePaintScheduler,
        SchedulerMixin,
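With these exports in place, the two Karras-style DPM schedulers can be imported from the package root like any other scheduler. A hedged sketch:

```python
# Assumes diffusers 0.9.0 with torch installed (the exports are torch-gated).
from diffusers import KDPM2AncestralDiscreteScheduler, KDPM2DiscreteScheduler

scheduler = KDPM2DiscreteScheduler(beta_schedule="scaled_linear")
```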
@@ -15,15 +15,16 @@


import os
import shutil
import sys
from pathlib import Path
from typing import Dict, Optional, Union
from uuid import uuid4

from huggingface_hub import HfFolder, whoami
from huggingface_hub import HfFolder, Repository, whoami

from . import __version__
from .utils import ENV_VARS_TRUE_VALUES, logging
from .utils import ENV_VARS_TRUE_VALUES, deprecate, logging
from .utils.import_utils import (
    _flax_version,
    _jax_version,

@@ -82,6 +83,121 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token:
    return f"{organization}/{model_id}"


def init_git_repo(args, at_init: bool = False):
    """
    Args:
    Initializes a git repo in `args.hub_model_id`.
        at_init (`bool`, *optional*, defaults to `False`):
            Whether this function is called before any training or not. If `self.args.overwrite_output_dir` is `True`
            and `at_init` is `True`, the path to the repo (which is `self.args.output_dir`) might be wiped out.
    """
    deprecation_message = (
        "Please use `huggingface_hub.Repository`. "
        "See `examples/unconditional_image_generation/train_unconditional.py` for an example."
    )
    deprecate("init_git_repo()", "0.10.0", deprecation_message)

    if hasattr(args, "local_rank") and args.local_rank not in [-1, 0]:
        return
    hub_token = args.hub_token if hasattr(args, "hub_token") else None
    use_auth_token = True if hub_token is None else hub_token
    if not hasattr(args, "hub_model_id") or args.hub_model_id is None:
        repo_name = Path(args.output_dir).absolute().name
    else:
        repo_name = args.hub_model_id
    if "/" not in repo_name:
        repo_name = get_full_repo_name(repo_name, token=hub_token)

    try:
        repo = Repository(
            args.output_dir,
            clone_from=repo_name,
            use_auth_token=use_auth_token,
            private=args.hub_private_repo,
        )
    except EnvironmentError:
        if args.overwrite_output_dir and at_init:
            # Try again after wiping output_dir
            shutil.rmtree(args.output_dir)
            repo = Repository(
                args.output_dir,
                clone_from=repo_name,
                use_auth_token=use_auth_token,
            )
        else:
            raise

    repo.git_pull()

    # By default, ignore the checkpoint folders
    if not os.path.exists(os.path.join(args.output_dir, ".gitignore")):
        with open(os.path.join(args.output_dir, ".gitignore"), "w", encoding="utf-8") as writer:
            writer.writelines(["checkpoint-*/"])

    return repo
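The deprecation message points users at `huggingface_hub.Repository` directly. A minimal sketch of the suggested replacement (repo name and directory are placeholders):

```python
from huggingface_hub import Repository

repo = Repository("output_dir", clone_from="my-user/my-model", use_auth_token=True)
repo.git_pull()
```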

def push_to_hub(
    args,
    pipeline,
    repo: Repository,
    commit_message: Optional[str] = "End of training",
    blocking: bool = True,
    **kwargs,
) -> str:
    """
    Parameters:
    Upload *self.model* and *self.tokenizer* to the 🤗 model hub on the repo *self.args.hub_model_id*.
        commit_message (`str`, *optional*, defaults to `"End of training"`):
            Message to commit while pushing.
        blocking (`bool`, *optional*, defaults to `True`):
            Whether the function should return only when the `git push` has finished.
        kwargs:
            Additional keyword arguments passed along to [`create_model_card`].
    Returns:
        The url of the commit of your model in the given repository if `blocking=False`, a tuple with the url of the
        commit and an object to track the progress of the commit if `blocking=True`
    """
    deprecation_message = (
        "Please use `huggingface_hub.Repository` and `Repository.push_to_hub()`. "
        "See `examples/unconditional_image_generation/train_unconditional.py` for an example."
    )
    deprecate("push_to_hub()", "0.10.0", deprecation_message)

    if not hasattr(args, "hub_model_id") or args.hub_model_id is None:
        model_name = Path(args.output_dir).name
    else:
        model_name = args.hub_model_id.split("/")[-1]

    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)
    logger.info(f"Saving pipeline checkpoint to {output_dir}")
    pipeline.save_pretrained(output_dir)

    # Only push from one node.
    if hasattr(args, "local_rank") and args.local_rank not in [-1, 0]:
        return

    # Cancel any async push in progress if blocking=True. The commits will all be pushed together.
    if (
        blocking
        and len(repo.command_queue) > 0
        and repo.command_queue[-1] is not None
        and not repo.command_queue[-1].is_done
    ):
        repo.command_queue[-1]._process.kill()

    git_head_commit_url = repo.push_to_hub(commit_message=commit_message, blocking=blocking, auto_lfs_prune=True)
    # push separately the model card to be independent from the rest of the model
    create_model_card(args, model_name=model_name)
    try:
        repo.push_to_hub(commit_message="update model card README.md", blocking=blocking, auto_lfs_prune=True)
    except EnvironmentError as exc:
        logger.error(f"Error pushing update to the model card. Please read logs and retry.\n${exc}")

    return git_head_commit_url

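Likewise, the replacement for this helper is `Repository.push_to_hub()` on the handle from the previous sketch:

```python
# Continues the hedged sketch above: one blocking push, pruning stale LFS blobs.
url = repo.push_to_hub(commit_message="End of training", blocking=True, auto_lfs_prune=True)
print(url)
```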
def create_model_card(args, model_name):
    if not is_modelcards_available:
        raise ValueError(

@@ -666,6 +666,20 @@ class ModelMixin(torch.nn.Module):
        return sum(p.numel() for p in self.parameters() if p.requires_grad or not only_trainable)


def unwrap_model(model: torch.nn.Module) -> torch.nn.Module:
    """
    Recursively unwraps a model from potential containers (as used in distributed training).

    Args:
        model (`torch.nn.Module`): The model to unwrap.
    """
    # since there could be multiple levels of wrapping, unwrap recursively
    if hasattr(model, "module"):
        return unwrap_model(model.module)
    else:
        return model

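A quick self-contained check of the helper's behavior (the redefinition below mirrors the function above):

```python
import torch

def unwrap_model(model: torch.nn.Module) -> torch.nn.Module:
    # peel `.module` wrappers recursively, as above
    return unwrap_model(model.module) if hasattr(model, "module") else model

linear = torch.nn.Linear(4, 4)
wrapped = torch.nn.DataParallel(linear)  # adds one `.module` level
assert unwrap_model(wrapped) is linear
```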
|
||||
def _get_model_file(
|
||||
pretrained_model_name_or_path,
|
||||
*,
|
||||
|
||||
@@ -246,6 +246,10 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
|
||||
|
||||
return Transformer2DModelOutput(sample=output)
|
||||
|
||||
def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
|
||||
for block in self.transformer_blocks:
|
||||
block._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
|
||||
|
||||
|
||||
class AttentionBlock(nn.Module):
|
||||
"""
|
||||
@@ -286,19 +290,11 @@ class AttentionBlock(nn.Module):
|
||||
self.rescale_output_factor = rescale_output_factor
|
||||
self.proj_attn = nn.Linear(channels, channels, 1)
|
||||
|
||||
def reshape_heads_to_batch_dim(self, tensor):
|
||||
batch_size, seq_len, dim = tensor.shape
|
||||
head_size = self.num_heads
|
||||
tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
|
||||
tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size * head_size, seq_len, dim // head_size)
|
||||
return tensor
|
||||
|
||||
def reshape_batch_dim_to_heads(self, tensor):
|
||||
batch_size, seq_len, dim = tensor.shape
|
||||
head_size = self.num_heads
|
||||
tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
|
||||
tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
|
||||
return tensor
|
||||
def transpose_for_scores(self, projection: torch.Tensor) -> torch.Tensor:
|
||||
new_projection_shape = projection.size()[:-1] + (self.num_heads, -1)
|
||||
# move heads to 2nd position (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D)
|
||||
new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3)
|
||||
return new_projection
|
||||
|
||||
def forward(self, hidden_states):
|
||||
residual = hidden_states
|
||||
@@ -316,28 +312,50 @@ class AttentionBlock(nn.Module):
|
||||
|
||||
scale = 1 / math.sqrt(self.channels / self.num_heads)
|
||||
|
||||
query_proj = self.reshape_heads_to_batch_dim(query_proj)
|
||||
key_proj = self.reshape_heads_to_batch_dim(key_proj)
|
||||
value_proj = self.reshape_heads_to_batch_dim(value_proj)
|
||||
# get scores
|
||||
if self.num_heads > 1:
|
||||
query_states = self.transpose_for_scores(query_proj)
|
||||
key_states = self.transpose_for_scores(key_proj)
|
||||
value_states = self.transpose_for_scores(value_proj)
|
||||
|
||||
# TODO: is there a way to perform batched matmul (e.g. baddbmm) on 4D tensors?
|
||||
# or reformulate this into a 3D problem?
|
||||
# TODO: measure whether on MPS device it would be faster to do this matmul via einsum
|
||||
# as some matmuls can be 1.94x slower than an equivalent einsum on MPS
|
||||
# https://gist.github.com/Birch-san/cba16789ec27bb20996a4b4831b13ce0
|
||||
attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) * scale
|
||||
else:
|
||||
query_states, key_states, value_states = query_proj, key_proj, value_proj
|
||||
|
||||
attention_scores = torch.baddbmm(
|
||||
torch.empty(
|
||||
query_states.shape[0],
|
||||
query_states.shape[1],
|
||||
key_states.shape[1],
|
||||
dtype=query_states.dtype,
|
||||
device=query_states.device,
|
||||
),
|
||||
query_states,
|
||||
key_states.transpose(-1, -2),
|
||||
beta=0,
|
||||
alpha=scale,
|
||||
)
|
||||
|
||||
attention_scores = torch.baddbmm(
|
||||
torch.empty(
|
||||
query_proj.shape[0],
|
||||
query_proj.shape[1],
|
||||
key_proj.shape[1],
|
||||
dtype=query_proj.dtype,
|
||||
device=query_proj.device,
|
||||
),
|
||||
query_proj,
|
||||
key_proj.transpose(-1, -2),
|
||||
beta=0,
|
||||
alpha=scale,
|
||||
)
|
||||
attention_probs = torch.softmax(attention_scores.float(), dim=-1).type(attention_scores.dtype)
|
||||
hidden_states = torch.bmm(attention_probs, value_proj)
|
||||
|
||||
# reshape hidden_states
|
||||
hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
|
||||
# compute attention output
|
||||
if self.num_heads > 1:
|
||||
# TODO: is there a way to perform batched matmul (e.g. bmm) on 4D tensors?
|
||||
# or reformulate this into a 3D problem?
|
||||
# TODO: measure whether on MPS device it would be faster to do this matmul via einsum
|
||||
# as some matmuls can be 1.94x slower than an equivalent einsum on MPS
|
||||
# https://gist.github.com/Birch-san/cba16789ec27bb20996a4b4831b13ce0
|
||||
hidden_states = torch.matmul(attention_probs, value_states)
|
||||
hidden_states = hidden_states.permute(0, 2, 1, 3).contiguous()
|
||||
new_hidden_states_shape = hidden_states.size()[:-2] + (self.channels,)
|
||||
hidden_states = hidden_states.view(new_hidden_states_shape)
|
||||
else:
|
||||
hidden_states = torch.bmm(attention_probs, value_states)
|
||||
|
||||
# compute next hidden_states
|
||||
hidden_states = self.proj_attn(hidden_states)
|
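The key change above is the head layout: `transpose_for_scores` keeps heads as a fourth dimension, (B, T, H*D) -> (B, H, T, D), so multi-head attention runs as a batched 4D matmul instead of folding heads into the batch axis. A toy sketch with assumed sizes:

```python
import torch

B, T, H, D = 2, 16, 8, 40
proj = torch.randn(B, T, H * D)
states = proj.view(B, T, H, D).permute(0, 2, 1, 3)                 # (B, H, T, D)
scores = torch.matmul(states, states.transpose(-1, -2)) * D**-0.5  # (B, H, T, T)
print(scores.shape)  # torch.Size([2, 8, 16, 16])
```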
@@ -410,7 +428,7 @@ class BasicTransformerBlock(nn.Module):
        # if xformers is installed try to use memory_efficient_attention by default
        if is_xformers_available():
            try:
                self.set_use_memory_efficient_attention_xformers(True)
                self._set_use_memory_efficient_attention_xformers(True)
            except Exception as e:
                warnings.warn(
                    "Could not enable memory efficient attention. Make sure xformers is installed"

@@ -421,7 +439,7 @@ class BasicTransformerBlock(nn.Module):
        self.attn1._slice_size = slice_size
        self.attn2._slice_size = slice_size

    def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
    def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
        if not is_xformers_available():
            print("Here is how to install it")
            raise ModuleNotFoundError(

@@ -831,3 +849,11 @@ class DualTransformer2DModel(nn.Module):
            return (output_states,)

        return Transformer2DModelOutput(sample=output_states)

    def _set_attention_slice(self, slice_size):
        for transformer in self.transformers:
            transformer._set_attention_slice(slice_size)

    def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
        for transformer in self.transformers:
            transformer._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
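These underscore-prefixed setters are internal plumbing; user code keeps calling the public helpers, roughly:

```python
# Hedged usage sketch; `pipe` is any loaded pipeline with xformers installed.
pipe.enable_xformers_memory_efficient_attention()
# ... run inference ...
pipe.disable_xformers_memory_efficient_attention()
```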
@@ -84,11 +84,10 @@ class FlaxTimesteps(nn.Module):
        Time step embedding dimension
    """
    dim: int = 32
    flip_sin_to_cos: bool = False
    freq_shift: float = 1

    @nn.compact
    def __call__(self, timesteps):
        return get_sinusoidal_embeddings(
            timesteps, embedding_dim=self.dim, flip_sin_to_cos=self.flip_sin_to_cos, freq_shift=self.freq_shift
            timesteps, embedding_dim=self.dim, freq_shift=self.freq_shift, flip_sin_to_cos=True
        )
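`flip_sin_to_cos=True` only changes which half of the embedding comes first. A plain PyTorch sketch of the layout (the frequency formula follows the usual diffusers convention and is an assumption here):

```python
import math
import torch

def sinusoidal_embedding(timesteps, dim, flip_sin_to_cos=True, freq_shift=0.0):
    half = dim // 2
    freqs = torch.exp(-math.log(10000) * torch.arange(half) / (half - freq_shift))
    args = timesteps[:, None].float() * freqs[None]
    sin, cos = torch.sin(args), torch.cos(args)
    # flip_sin_to_cos=True puts the cosine half first
    return torch.cat([cos, sin], dim=-1) if flip_sin_to_cos else torch.cat([sin, cos], dim=-1)

print(sinusoidal_embedding(torch.tensor([0, 10]), 8).shape)  # torch.Size([2, 8])
```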
@@ -418,6 +418,10 @@ class UNetMidBlock2DCrossAttn(nn.Module):
        for attn in self.attentions:
            attn._set_attention_slice(slice_size)

    def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
        for attn in self.attentions:
            attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)

    def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
        hidden_states = self.resnets[0](hidden_states, temb)
        for attn, resnet in zip(self.attentions, self.resnets[1:]):

@@ -612,6 +616,10 @@ class CrossAttnDownBlock2D(nn.Module):
        for attn in self.attentions:
            attn._set_attention_slice(slice_size)

    def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
        for attn in self.attentions:
            attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)

    def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
        output_states = ()

@@ -1209,6 +1217,10 @@ class CrossAttnUpBlock2D(nn.Module):

        self.gradient_checkpointing = False

    def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
        for attn in self.attentions:
            attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)

    def forward(
        self,
        hidden_states,

@@ -252,6 +252,17 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin):
            if hasattr(block, "attentions") and block.attentions is not None:
                block.set_attention_slice(slice_size)

    def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
        for block in self.down_blocks:
            if hasattr(block, "attentions") and block.attentions is not None:
                block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)

        self.mid_block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)

        for block in self.up_blocks:
            if hasattr(block, "attentions") and block.attentions is not None:
                block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D, CrossAttnUpBlock2D, UpBlock2D)):
            module.gradient_checkpointing = value

@@ -299,14 +310,8 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin):
        timesteps = timestep
        if not torch.is_tensor(timesteps):
            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
            # This would be a good case for the `match` statement (Python 3.10+)
            is_mps = sample.device.type == "mps"
            if torch.is_floating_point(timesteps):
                dtype = torch.float32 if is_mps else torch.float64
            else:
                dtype = torch.int32 if is_mps else torch.int64
            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
        elif len(timesteps.shape) == 0:
            timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device)
        elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
            timesteps = timesteps[None].to(sample.device)

        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
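The normalization above accepts Python scalars, 0-d tensors, and 1-d tensors alike, with float64 avoided on MPS (which lacks it). A toy sketch of the scalar path:

```python
import torch

sample = torch.randn(2, 4, 8, 8)
timestep = 3.5  # plain Python scalar
is_mps = sample.device.type == "mps"
dtype = torch.float32 if is_mps else torch.float64
timesteps = torch.tensor([timestep], dtype=dtype, device=sample.device)
timesteps = timesteps.expand(sample.shape[0])  # broadcast to the batch
print(timesteps)  # tensor([3.5000, 3.5000], dtype=torch.float64)
```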
@@ -85,10 +85,6 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
            The dimension of the cross attention features.
        dropout (`float`, *optional*, defaults to 0):
            Dropout probability for down, up and bottleneck blocks.
        flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
            Whether to flip the sin to cos in the time embedding.
        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.

    """

    sample_size: int = 32

@@ -109,7 +105,6 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
    dropout: float = 0.0
    use_linear_projection: bool = False
    dtype: jnp.dtype = jnp.float32
    flip_sin_to_cos: bool = True
    freq_shift: int = 0

    def init_weights(self, rng: jax.random.PRNGKey) -> FrozenDict:

@@ -138,9 +133,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
        )

        # time
        self.time_proj = FlaxTimesteps(
            block_out_channels[0], flip_sin_to_cos=self.flip_sin_to_cos, freq_shift=self.config.freq_shift
        )
        self.time_proj = FlaxTimesteps(block_out_channels[0], freq_shift=self.config.freq_shift)
        self.time_embedding = FlaxTimestepEmbedding(time_embed_dim, dtype=self.dtype)

        only_cross_attention = self.only_cross_attention

@@ -603,163 +603,17 @@ class AutoencoderKL(ModelMixin, ConfigMixin):
        self.use_slicing = False

    def decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
        # if self.use_slicing and z.shape[0] > 1:
        # decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
        # decoded = torch.cat(decoded_slices)
        # else:
        # decoded = self._decode(z).sample

        decoded = self.split_decode(z)
        if self.use_slicing and z.shape[0] > 1:
            decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
            decoded = torch.cat(decoded_slices)
        else:
            decoded = self._decode(z).sample

        if not return_dict:
            return (decoded,)

        return DecoderOutput(sample=decoded)
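Sliced decoding is opt-in. A hedged usage sketch, assuming `vae` is an `AutoencoderKL` and `latents` a batched latent tensor:

```python
vae.enable_slicing()                 # decode one sample at a time, lower peak memory
images = vae.decode(latents).sample
vae.disable_slicing()                # back to single-pass decoding
```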
    def meshgrid(self, h, w):
        y = torch.arange(0, h).view(h, 1, 1).repeat(1, w, 1)
        x = torch.arange(0, w).view(1, w, 1).repeat(h, 1, 1)

        arr = torch.cat([y, x], dim=-1)
        return arr

    def delta_border(self, h, w):
        """
        :param h: height :param w: width :return: normalized distance to image border,
        wtith min distance = 0 at border and max dist = 0.5 at image center
        """
        lower_right_corner = torch.tensor([h - 1, w - 1]).view(1, 1, 2)
        arr = self.meshgrid(h, w) / lower_right_corner
        dist_left_up = torch.min(arr, dim=-1, keepdims=True)[0]
        dist_right_down = torch.min(1 - arr, dim=-1, keepdims=True)[0]
        edge_dist = torch.min(torch.cat([dist_left_up, dist_right_down], dim=-1), dim=-1)[0]
        return edge_dist

    def get_weighting(self, h, w, Ly, Lx, device):
        weighting = self.delta_border(h, w)
        weighting = torch.clip(
            weighting,
            self.split_input_params["clip_min_weight"],
            self.split_input_params["clip_max_weight"],
        )
        weighting = weighting.view(1, h * w, 1).repeat(1, 1, Ly * Lx).to(device)

        if self.split_input_params["tie_braker"]:
            L_weighting = self.delta_border(Ly, Lx)
            L_weighting = torch.clip(
                L_weighting,
                self.split_input_params["clip_min_tie_weight"],
                self.split_input_params["clip_max_tie_weight"],
            )

            L_weighting = L_weighting.view(1, 1, Ly * Lx).to(device)
            weighting = weighting * L_weighting
        return weighting

    def get_fold_unfold(self, x, kernel_size, stride, uf=1, df=1):  # todo load once not every time, shorten code
        """
        :param x: img of size (bs, c, h, w) :return: n img crops of size (n, bs, c, kernel_size[0], kernel_size[1])
        """
        bs, nc, h, w = x.shape

        # number of crops in image
        Ly = (h - kernel_size[0]) // stride[0] + 1
        Lx = (w - kernel_size[1]) // stride[1] + 1

        if uf == 1 and df == 1:
            fold_params = dict(kernel_size=kernel_size, dilation=1, padding=0, stride=stride)
            unfold = torch.nn.Unfold(**fold_params)

            fold = torch.nn.Fold(output_size=x.shape[2:], **fold_params)

            weighting = self.get_weighting(kernel_size[0], kernel_size[1], Ly, Lx, x.device).to(x.dtype)
            normalization = fold(weighting).view(1, 1, h, w)  # normalizes the overlap
            weighting = weighting.view((1, 1, kernel_size[0], kernel_size[1], Ly * Lx))

        elif uf > 1 and df == 1:
            fold_params = dict(kernel_size=kernel_size, dilation=1, padding=0, stride=stride)
            unfold = torch.nn.Unfold(**fold_params)

            fold_params2 = dict(
                kernel_size=(kernel_size[0] * uf, kernel_size[0] * uf),
                dilation=1,
                padding=0,
                stride=(stride[0] * uf, stride[1] * uf),
            )
            fold = torch.nn.Fold(output_size=(x.shape[2] * uf, x.shape[3] * uf), **fold_params2)

            weighting = self.get_weighting(kernel_size[0] * uf, kernel_size[1] * uf, Ly, Lx, x.device).to(x.dtype)
            normalization = fold(weighting).view(1, 1, h * uf, w * uf)  # normalizes the overlap
            weighting = weighting.view((1, 1, kernel_size[0] * uf, kernel_size[1] * uf, Ly * Lx))

        elif df > 1 and uf == 1:
            fold_params = dict(kernel_size=kernel_size, dilation=1, padding=0, stride=stride)
            unfold = torch.nn.Unfold(**fold_params)

            fold_params2 = dict(
                kernel_size=(kernel_size[0] // df, kernel_size[0] // df),
                dilation=1,
                padding=0,
                stride=(stride[0] // df, stride[1] // df),
            )
            fold = torch.nn.Fold(output_size=(x.shape[2] // df, x.shape[3] // df), **fold_params2)

            weighting = self.get_weighting(kernel_size[0] // df, kernel_size[1] // df, Ly, Lx, x.device).to(x.dtype)
            normalization = fold(weighting).view(1, 1, h // df, w // df)  # normalizes the overlap
            weighting = weighting.view((1, 1, kernel_size[0] // df, kernel_size[1] // df, Ly * Lx))

        else:
            raise NotImplementedError

        return fold, unfold, normalization, weighting

    def split_decode(self, z: torch.FloatTensor) -> torch.FloatTensor:
        ks = 128
        stride = 64
        vqf = 2 ** (len(self.config.block_out_channels) - 1)
        self.split_input_params = {
            "ks": (ks, ks),
            "stride": (stride, stride),
            "vqf": vqf,
            "patch_distributed_vq": True,
            "tie_braker": False,
            "clip_max_weight": 0.5,
            "clip_min_weight": 0.01,
            "clip_max_tie_weight": 0.5,
            "clip_min_tie_weight": 0.01,
        }

        ks = self.split_input_params["ks"]  # eg. (128, 128)
        stride = self.split_input_params["stride"]  # eg. (64, 64)
        uf = self.split_input_params["vqf"]
        bs, nc, h, w = z.shape
        if ks[0] > h or ks[1] > w:
            ks = (min(ks[0], h), min(ks[1], w))
            print("reducing Kernel")

        if stride[0] > h or stride[1] > w:
            stride = (min(stride[0], h), min(stride[1], w))
            print("reducing stride")

        fold, unfold, normalization, weighting = self.get_fold_unfold(z, ks, stride, uf=vqf)

        z = unfold(z)  # (bn, nc * prod(**ks), L)
        # 1. Reshape to img shape
        z = z.view((z.shape[0], -1, ks[0], ks[1], z.shape[-1]))  # (bn, nc, ks[0], ks[1], L )

        # 2. apply model loop over last dim

        output_list = [self._decode(z[:, :, :, :, i]).sample for i in range(z.shape[-1])]

        o = torch.stack(output_list, axis=-1)  # # (bn, nc, ks[0], ks[1], L)
        o = o * weighting
        # Reverse 1. reshape to img shape
        o = o.view((o.shape[0], -1, o.shape[-1]))  # (bn, nc * ks[0] * ks[1], L)
        # stitch crops together
        decoded = fold(o)
        decoded = decoded / normalization  # norm is shape (1, 1, h, w)
        return decoded
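The removed code leans on a standard fold/unfold identity: summing overlapping crops back together and dividing by a folded weight map recovers the input. A toy round trip:

```python
import torch

x = torch.randn(1, 4, 16, 16)
ks, stride = (8, 8), (4, 4)
unfold = torch.nn.Unfold(kernel_size=ks, stride=stride)
fold = torch.nn.Fold(output_size=(16, 16), kernel_size=ks, stride=stride)

patches = unfold(x)                               # (1, 4*8*8, L)
normalization = fold(unfold(torch.ones_like(x)))  # counts the overlaps
recon = fold(patches) / normalization
print(torch.allclose(recon, x, atol=1e-6))        # True
```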
    def forward(
        self,
        sample: torch.FloatTensor,

@@ -377,8 +377,7 @@ class DiffusionPipeline(ConfigMixin):
                also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
                model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch,
                setting this argument to `True` will raise an error.
            return_cached_folder (`bool`, *optional*, defaults to `False`):
                If set to `True`, path to downloaded cached folder will be returned in addition to loaded pipeline.

            kwargs (remaining dictionary of keyword arguments, *optional*):
                Can be used to overwrite load - and saveable variables - *i.e.* the pipeline components - of the
                specific pipeline class. The overwritten components are then directly passed to the pipelines

@@ -431,7 +430,33 @@ class DiffusionPipeline(ConfigMixin):
        sess_options = kwargs.pop("sess_options", None)
        device_map = kwargs.pop("device_map", None)
        low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
        return_cached_folder = kwargs.pop("return_cached_folder", False)

        if low_cpu_mem_usage and not is_accelerate_available():
            low_cpu_mem_usage = False
            logger.warning(
                "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
                " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
                " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
                " install accelerate\n```\n."
            )

        if device_map is not None and not is_torch_version(">=", "1.9.0"):
            raise NotImplementedError(
                "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
                " `device_map=None`."
            )

        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
            raise NotImplementedError(
                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
                " `low_cpu_mem_usage=False`."
            )

        if low_cpu_mem_usage is False and device_map is not None:
            raise ValueError(
                f"You cannot set `low_cpu_mem_usage` to False while using device_map={device_map} for loading and"
                " dispatching. Please make sure to set `low_cpu_mem_usage=True`."
            )
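A hedged usage sketch of the new flag (the model id is only an example):

```python
from diffusers import DiffusionPipeline

pipe, cached_folder = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", return_cached_folder=True
)
print(cached_folder)  # local snapshot directory backing the pipeline
```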
        # 1. Download the checkpoints and configs
        # use snapshot download here to get it working from from_pretrained

@@ -560,33 +585,6 @@ class DiffusionPipeline(ConfigMixin):
                f"Keyword arguments {unused_kwargs} are not expected by {pipeline_class.__name__} and will be ignored."
            )

        if low_cpu_mem_usage and not is_accelerate_available():
            low_cpu_mem_usage = False
            logger.warning(
                "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
                " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
                " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
                " install accelerate\n```\n."
            )

        if device_map is not None and not is_torch_version(">=", "1.9.0"):
            raise NotImplementedError(
                "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
                " `device_map=None`."
            )

        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
            raise NotImplementedError(
                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
                " `low_cpu_mem_usage=False`."
            )

        if low_cpu_mem_usage is False and device_map is not None:
            raise ValueError(
                f"You cannot set `low_cpu_mem_usage` to False while using device_map={device_map} for loading and"
                " dispatching. Please make sure to set `low_cpu_mem_usage=True`."
            )

        # import it here to avoid circular import
        from diffusers import pipelines

@@ -706,9 +704,6 @@ class DiffusionPipeline(ConfigMixin):

        # 5. Instantiate the pipeline
        model = pipeline_class(**init_kwargs)

        if return_cached_folder:
            return model, cached_folder
        return model

    @staticmethod

@@ -789,38 +784,3 @@

    def set_progress_bar_config(self, **kwargs):
        self._progress_bar_config = kwargs

    def enable_xformers_memory_efficient_attention(self):
        r"""
        Enable memory efficient attention as implemented in xformers.

        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
        time. Speed up at training time is not guaranteed.

        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
        is used.
        """
        self.set_use_memory_efficient_attention_xformers(True)

    def disable_xformers_memory_efficient_attention(self):
        r"""
        Disable memory efficient attention as implemented in xformers.
        """
        self.set_use_memory_efficient_attention_xformers(False)

    def set_use_memory_efficient_attention_xformers(self, valid: bool) -> None:
        # Recursively walk through all the children.
        # Any children which exposes the set_use_memory_efficient_attention_xformers method
        # gets the message
        def fn_recursive_set_mem_eff(module: torch.nn.Module):
            if hasattr(module, "set_use_memory_efficient_attention_xformers"):
                module.set_use_memory_efficient_attention_xformers(valid)

            for child in module.children():
                fn_recursive_set_mem_eff(child)

        module_names, _, _ = self.extract_init_dict(dict(self.config))
        for module_name in module_names:
            module = getattr(self, module_name)
            if isinstance(module, torch.nn.Module):
                fn_recursive_set_mem_eff(module)
@@ -126,7 +126,7 @@ init_image = init_image.resize((768, 512))

prompt = "A fantasy landscape, trending on artstation"

images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images

images[0].save("fantasy_landscape.png")
```

@@ -166,6 +166,24 @@ class AltDiffusionPipeline(DiffusionPipeline):
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.register_to_config(requires_safety_checker=requires_safety_checker)

    def enable_xformers_memory_efficient_attention(self):
        r"""
        Enable memory efficient attention as implemented in xformers.

        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
        time. Speed up at training time is not guaranteed.

        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
        is used.
        """
        self.unet.set_use_memory_efficient_attention_xformers(True)

    def disable_xformers_memory_efficient_attention(self):
        r"""
        Disable memory efficient attention as implemented in xformers.
        """
        self.unet.set_use_memory_efficient_attention_xformers(False)

    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
        r"""
        Enable sliced attention computation.

@@ -558,7 +576,7 @@ class AltDiffusionPipeline(DiffusionPipeline):
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

                # call the callback, if provided
                if (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        callback(i, t, latents)
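The fix matters when the step count does not line up with `scheduler.order`: the old condition could skip the final progress update. A toy check of the new predicate:

```python
num_warmup_steps, order = 1, 2
timesteps = list(range(7))
updates = [
    i
    for i in range(len(timesteps))
    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % order == 0)
]
print(updates)  # [1, 3, 5, 6] -- the last step is now always included
```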
@@ -251,6 +251,24 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
                return torch.device(module._hf_hook.execution_device)
        return self.device

    def enable_xformers_memory_efficient_attention(self):
        r"""
        Enable memory efficient attention as implemented in xformers.

        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
        time. Speed up at training time is not guaranteed.

        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
        is used.
        """
        self.unet.set_use_memory_efficient_attention_xformers(True)

    def disable_xformers_memory_efficient_attention(self):
        r"""
        Disable memory efficient attention as implemented in xformers.
        """
        self.unet.set_use_memory_efficient_attention_xformers(False)

    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
        r"""
        Encodes the prompt into text encoder hidden states.

@@ -417,9 +435,9 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):

        return timesteps, num_inference_steps - t_start

    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
        image = image.to(device=device, dtype=dtype)
        init_latent_dist = self.vae.encode(image).latent_dist
    def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
        init_image = init_image.to(device=device, dtype=dtype)
        init_latent_dist = self.vae.encode(init_image).latent_dist
        init_latents = init_latent_dist.sample(generator=generator)
        init_latents = 0.18215 * init_latents

@@ -427,16 +445,16 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
            # expand init_latents for batch_size
            deprecation_message = (
                f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
                " images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
                " your script to pass as many initial images as text prompts to suppress this warning."
                " your script to pass as many init images as text prompts to suppress this warning."
            )
            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
            deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
            additional_image_per_prompt = batch_size // init_latents.shape[0]
            init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
            raise ValueError(
                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
                f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
            )
        else:
            init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
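The duplication rule above in isolation, with toy shapes:

```python
import torch

init_latents = torch.randn(1, 4, 64, 64)  # one encoded init image
batch_size, num_images_per_prompt = 4, 2  # four prompts, two images each
additional = batch_size // init_latents.shape[0]
init_latents = torch.cat([init_latents] * additional * num_images_per_prompt, dim=0)
print(init_latents.shape)  # torch.Size([8, 4, 64, 64])
```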
@@ -454,7 +472,7 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
    def __call__(
        self,
        prompt: Union[str, List[str]],
        image: Union[torch.FloatTensor, PIL.Image.Image],
        init_image: Union[torch.FloatTensor, PIL.Image.Image],
        strength: float = 0.8,
        num_inference_steps: Optional[int] = 50,
        guidance_scale: Optional[float] = 7.5,

@@ -466,7 +484,6 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: Optional[int] = 1,
        **kwargs,
    ):
        r"""
        Function invoked when calling the pipeline for generation.

@@ -474,15 +491,15 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            image (`torch.FloatTensor` or `PIL.Image.Image`):
            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process.
            strength (`float`, *optional*, defaults to 0.8):
                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
                be maximum and the denoising process will run for the full number of iterations specified in
                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                noise will be maximum and the denoising process will run for the full number of iterations specified in
                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference. This parameter will be modulated by `strength`.

@@ -523,10 +540,6 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
            (nsfw) content, according to the `safety_checker`.
        """
        message = "Please use `image` instead of `init_image`."
        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
        image = init_image or image

        # 1. Check inputs
        self.check_inputs(prompt, strength, callback_steps)

@@ -544,8 +557,8 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
        )

        # 4. Preprocess image
        if isinstance(image, PIL.Image.Image):
            image = preprocess(image)
        if isinstance(init_image, PIL.Image.Image):
            init_image = preprocess(init_image)

        # 5. set timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)

@@ -554,7 +567,7 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):

        # 6. Prepare latent variables
        latents = self.prepare_latents(
            image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
            init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
        )

        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline

@@ -580,7 +593,7 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

                # call the callback, if provided
                if (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        callback(i, t, latents)

@@ -73,7 +73,7 @@ class DDPMPipeline(DiffusionPipeline):
                "Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
                " DDPMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
            )
            predict_epsilon = deprecate("predict_epsilon", "0.11.0", message, take_from=kwargs)
            predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)

            if predict_epsilon is not None:
                new_config = dict(self.scheduler.config)

@@ -17,7 +17,7 @@ from ...schedulers import (
    LMSDiscreteScheduler,
    PNDMScheduler,
)
from ...utils import PIL_INTERPOLATION, deprecate
from ...utils import PIL_INTERPOLATION


def preprocess(image):

@@ -66,7 +66,7 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
    @torch.no_grad()
    def __call__(
        self,
        image: Union[torch.Tensor, PIL.Image.Image],
        init_image: Union[torch.Tensor, PIL.Image.Image],
        batch_size: Optional[int] = 1,
        num_inference_steps: Optional[int] = 100,
        eta: Optional[float] = 0.0,

@@ -77,7 +77,7 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
    ) -> Union[Tuple, ImagePipelineOutput]:
        r"""
        Args:
            image (`torch.Tensor` or `PIL.Image.Image`):
            init_image (`torch.Tensor` or `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process.
            batch_size (`int`, *optional*, defaults to 1):

@@ -102,21 +102,20 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
            `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
            generated images.
        """
        message = "Please use `image` instead of `init_image`."
        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
        image = init_image or image

        if isinstance(image, PIL.Image.Image):
        if isinstance(init_image, PIL.Image.Image):
            batch_size = 1
        elif isinstance(image, torch.Tensor):
            batch_size = image.shape[0]
        elif isinstance(init_image, torch.Tensor):
            batch_size = init_image.shape[0]
        else:
            raise ValueError(f"`image` has to be of type `PIL.Image.Image` or `torch.Tensor` but is {type(image)}")
            raise ValueError(
                f"`init_image` has to be of type `PIL.Image.Image` or `torch.Tensor` but is {type(init_image)}"
            )

        if isinstance(image, PIL.Image.Image):
            image = preprocess(image)
        if isinstance(init_image, PIL.Image.Image):
            init_image = preprocess(init_image)

        height, width = image.shape[-2:]
        height, width = init_image.shape[-2:]

        # in_channels should be 6: 3 for latents, 3 for low resolution image
        latents_shape = (batch_size, self.unet.in_channels // 2, height, width)

@@ -129,7 +128,7 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
        else:
            latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)

        image = image.to(device=self.device, dtype=latents_dtype)
        init_image = init_image.to(device=self.device, dtype=latents_dtype)

        # set timesteps and move to the correct device
        self.scheduler.set_timesteps(num_inference_steps, device=self.device)

@@ -149,7 +148,7 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):

        for t in self.progress_bar(timesteps_tensor):
            # concat latents and low resolution image in the channel dimension.
            latents_input = torch.cat([latents, image], dim=1)
            latents_input = torch.cat([latents, init_image], dim=1)
            latents_input = self.scheduler.scale_model_input(latents_input, t)
            # predict the noise residual
            noise_pred = self.unet(latents_input, t).sample
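Put together, the super-resolution pipeline at this version takes `init_image` and concatenates it with the latents channel-wise at every step. A hedged end-to-end sketch (model id and file name are assumptions):

```python
import PIL.Image
from diffusers import LDMSuperResolutionPipeline

pipe = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages")
low_res = PIL.Image.open("low_res.png").convert("RGB").resize((128, 128))
upscaled = pipe(init_image=low_res, num_inference_steps=100, eta=1.0).images[0]
upscaled.save("upscaled.png")
```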
||||
@@ -138,7 +138,7 @@ prompt = "An astronaut riding an elephant"
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
source_prompt=source_prompt,
|
||||
image=init_image,
|
||||
init_image=init_image,
|
||||
num_inference_steps=100,
|
||||
eta=0.1,
|
||||
strength=0.8,
|
||||
@@ -164,7 +164,7 @@ torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
source_prompt=source_prompt,
|
||||
image=init_image,
|
||||
init_image=init_image,
|
||||
num_inference_steps=100,
|
||||
eta=0.1,
|
||||
strength=0.85,
|
||||
|
||||
@@ -285,6 +285,26 @@ class CycleDiffusionPipeline(DiffusionPipeline):
|
||||
return torch.device(module._hf_hook.execution_device)
|
||||
return self.device
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention
|
||||
def enable_xformers_memory_efficient_attention(self):
|
||||
r"""
|
||||
Enable memory efficient attention as implemented in xformers.
|
||||
|
||||
When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
|
||||
time. Speed up at training time is not guaranteed.
|
||||
|
||||
Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
|
||||
is used.
|
||||
"""
|
||||
self.unet.set_use_memory_efficient_attention_xformers(True)
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_xformers_memory_efficient_attention
|
||||
def disable_xformers_memory_efficient_attention(self):
|
||||
r"""
|
||||
Disable memory efficient attention as implemented in xformers.
|
||||
"""
|
||||
self.unet.set_use_memory_efficient_attention_xformers(False)
|
||||
|
||||
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
|
||||
def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
|
||||
r"""
|
||||
@@ -457,9 +477,9 @@ class CycleDiffusionPipeline(DiffusionPipeline):
|
||||
|
||||
return timesteps, num_inference_steps - t_start
|
||||
|
||||
def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
|
||||
image = image.to(device=device, dtype=dtype)
|
||||
init_latent_dist = self.vae.encode(image).latent_dist
|
||||
def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
|
||||
init_image = init_image.to(device=device, dtype=dtype)
|
||||
init_latent_dist = self.vae.encode(init_image).latent_dist
|
||||
init_latents = init_latent_dist.sample(generator=generator)
|
||||
init_latents = 0.18215 * init_latents
|
||||
|
||||
@@ -467,16 +487,16 @@ class CycleDiffusionPipeline(DiffusionPipeline):
|
||||
# expand init_latents for batch_size
|
||||
deprecation_message = (
|
||||
f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
|
||||
" images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
|
||||
" images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
|
||||
" that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
|
||||
" your script to pass as many initial images as text prompts to suppress this warning."
|
||||
" your script to pass as many init images as text prompts to suppress this warning."
|
||||
)
|
||||
deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
|
||||
deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
|
||||
additional_image_per_prompt = batch_size // init_latents.shape[0]
|
||||
init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
|
||||
elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
|
||||
raise ValueError(
|
||||
f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
|
||||
f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
|
||||
)
|
||||
else:
|
||||
init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
|
||||
@@ -496,7 +516,7 @@ class CycleDiffusionPipeline(DiffusionPipeline):
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
source_prompt: Union[str, List[str]],
|
||||
image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
init_image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
strength: float = 0.8,
|
||||
num_inference_steps: Optional[int] = 50,
|
||||
guidance_scale: Optional[float] = 7.5,
|
||||
@@ -508,7 +528,6 @@ class CycleDiffusionPipeline(DiffusionPipeline):
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
|
||||
callback_steps: Optional[int] = 1,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Function invoked when calling the pipeline for generation.
|
||||
@@ -516,15 +535,15 @@ class CycleDiffusionPipeline(DiffusionPipeline):
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
image (`torch.FloatTensor` or `PIL.Image.Image`):
|
||||
init_image (`torch.FloatTensor` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, that will be used as the starting point for the
|
||||
process.
|
||||
strength (`float`, *optional*, defaults to 0.8):
|
||||
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
|
||||
will be used as a starting point, adding more noise to it the larger the `strength`. The number of
|
||||
denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
|
||||
be maximum and the denoising process will run for the full number of iterations specified in
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
|
||||
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
|
||||
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
|
||||
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
|
||||
noise will be maximum and the denoising process will run for the full number of iterations specified in
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference. This parameter will be modulated by `strength`.
@@ -565,10 +584,6 @@ class CycleDiffusionPipeline(DiffusionPipeline):
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, according to the `safety_checker`.
"""
message = "Please use `image` instead of `init_image`."
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
image = init_image or image
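
For context, these three lines are the backwards-compatibility shim used throughout this changeset: `deprecate` pops the old keyword out of `**kwargs` (warning that it is scheduled for removal in 0.12.0), and the `or` falls back to it when the new argument was not given. From a caller's perspective the two spellings behave identically while the shim is in place; a sketch (pipeline and arguments as in the docs examples above):

```python
# Old spelling (works while the shim is present, but emits a deprecation warning):
images = pipe(prompt=prompt, init_image=init_image, strength=0.75).images

# New spelling:
images = pipe(prompt=prompt, image=init_image, strength=0.75).images
```
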

# 1. Check inputs
self.check_inputs(prompt, strength, callback_steps)

@@ -587,8 +602,8 @@ class CycleDiffusionPipeline(DiffusionPipeline):
)

# 4. Preprocess image
if isinstance(image, PIL.Image.Image):
image = preprocess(image)
if isinstance(init_image, PIL.Image.Image):
init_image = preprocess(init_image)

# 5. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps, device=device)
@@ -597,7 +612,7 @@ class CycleDiffusionPipeline(DiffusionPipeline):

# 6. Prepare latent variables
latents, clean_latents = self.prepare_latents(
image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
)
source_latents = latents

@@ -666,7 +681,7 @@ class CycleDiffusionPipeline(DiffusionPipeline):
).prev_sample

# call the callback, if provided
if (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
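
The reworked condition fixes an off-by-one in progress reporting: for schedulers with `order > 1` (e.g. the Heun scheduler below, which evaluates the model twice per step), `(i + 1) % order` can skip the final iteration, so the loop now also fires on `i == len(timesteps) - 1`. A toy trace of the behavior (values illustrative):

```python
# Toy trace: 5 timesteps, a second-order scheduler, no warmup steps.
timesteps, order, num_warmup_steps = range(5), 2, 0

updates = [
    i for i in timesteps
    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % order == 0)
]
print(updates)  # [1, 3, 4] -- the last step now always reports progress
```
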

@@ -229,7 +229,7 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
def __call__(
self,
prompt: Union[str, List[str]],
image: Union[np.ndarray, PIL.Image.Image],
init_image: Union[np.ndarray, PIL.Image.Image],
strength: float = 0.8,
num_inference_steps: Optional[int] = 50,
guidance_scale: Optional[float] = 7.5,
@@ -241,7 +241,6 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
return_dict: bool = True,
callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
callback_steps: Optional[int] = 1,
**kwargs,
):
r"""
Function invoked when calling the pipeline for generation.
@@ -249,15 +248,15 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
Args:
prompt (`str` or `List[str]`):
The prompt or prompts to guide the image generation.
image (`np.ndarray` or `PIL.Image.Image`):
init_image (`np.ndarray` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, that will be used as the starting point for the
process.
strength (`float`, *optional*, defaults to 0.8):
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
will be used as a starting point, adding more noise to it the larger the `strength`. The number of
denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
be maximum and the denoising process will run for the full number of iterations specified in
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
noise will be maximum and the denoising process will run for the full number of iterations specified in
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference. This parameter will be modulated by `strength`.
@@ -297,10 +296,6 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, according to the `safety_checker`.
"""
message = "Please use `image` instead of `init_image`."
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
image = init_image or image

if isinstance(prompt, str):
batch_size = 1
elif isinstance(prompt, list):
@@ -325,8 +320,8 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
# set timesteps
self.scheduler.set_timesteps(num_inference_steps)

if isinstance(image, PIL.Image.Image):
image = preprocess(image)
if isinstance(init_image, PIL.Image.Image):
init_image = preprocess(init_image)

# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -338,9 +333,9 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
)

latents_dtype = text_embeddings.dtype
image = image.astype(latents_dtype)
init_image = init_image.astype(latents_dtype)
# encode the init image into latents and scale the latents
init_latents = self.vae_encoder(sample=image)[0]
init_latents = self.vae_encoder(sample=init_image)[0]
init_latents = 0.18215 * init_latents

if isinstance(prompt, str):
@@ -349,16 +344,16 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
# expand init_latents for batch_size
deprecation_message = (
f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
" images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
" images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
" that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
" your script to pass as many initial images as text prompts to suppress this warning."
" your script to pass as many init images as text prompts to suppress this warning."
)
deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
additional_image_per_prompt = len(prompt) // init_latents.shape[0]
init_latents = np.concatenate([init_latents] * additional_image_per_prompt * num_images_per_prompt, axis=0)
elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0:
raise ValueError(
f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts."
f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts."
)
else:
init_latents = np.concatenate([init_latents] * num_images_per_prompt, axis=0)

@@ -228,7 +228,7 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
def __call__(
self,
prompt: Union[str, List[str]],
image: Union[np.ndarray, PIL.Image.Image],
init_image: Union[np.ndarray, PIL.Image.Image],
mask_image: Union[np.ndarray, PIL.Image.Image],
strength: float = 0.8,
num_inference_steps: Optional[int] = 50,
@@ -241,7 +241,6 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
return_dict: bool = True,
callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
callback_steps: Optional[int] = 1,
**kwargs,
):
r"""
Function invoked when calling the pipeline for generation.
@@ -249,20 +248,20 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
Args:
prompt (`str` or `List[str]`):
The prompt or prompts to guide the image generation.
image (`np.ndarray` or `PIL.Image.Image`):
init_image (`np.ndarray` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, that will be used as the starting point for the
process. This is the image whose masked region will be inpainted.
mask_image (`np.ndarray` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
`Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
strength (`float`, *optional*, defaults to 0.8):
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
will be used as a starting point, adding more noise to it the larger the `strength`. The number of
denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
be maximum and the denoising process will run for the full number of iterations specified in
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
noise will be maximum and the denoising process will run for the full number of iterations specified in
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference. This parameter will be modulated by `strength`.
@@ -302,10 +301,6 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, according to the `safety_checker`.
"""
message = "Please use `image` instead of `init_image`."
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
image = init_image or image

if isinstance(prompt, str):
batch_size = 1
elif isinstance(prompt, list):
@@ -330,8 +325,8 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
# set timesteps
self.scheduler.set_timesteps(num_inference_steps)

if isinstance(image, PIL.Image.Image):
image = preprocess(image)
if isinstance(init_image, PIL.Image.Image):
init_image = preprocess(init_image)

# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -343,10 +338,10 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
)

latents_dtype = text_embeddings.dtype
image = image.astype(latents_dtype)
init_image = init_image.astype(latents_dtype)

# encode the init image into latents and scale the latents
init_latents = self.vae_encoder(sample=image)[0]
init_latents = self.vae_encoder(sample=init_image)[0]
init_latents = 0.18215 * init_latents

# Expand init_latents for batch_size and num_images_per_prompt
@@ -361,7 +356,7 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):

# check sizes
if not mask.shape == init_latents.shape:
raise ValueError("The mask and image should be the same size!")
raise ValueError("The mask and init_image should be the same size!")

# get the original timestep using init_timestep
offset = self.scheduler.config.get("steps_offset", 0)

@@ -165,6 +165,24 @@ class StableDiffusionPipeline(DiffusionPipeline):
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)

def enable_xformers_memory_efficient_attention(self):
r"""
Enable memory efficient attention as implemented in xformers.

When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
time. Speed up at training time is not guaranteed.

Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
is used.
"""
self.unet.set_use_memory_efficient_attention_xformers(True)

def disable_xformers_memory_efficient_attention(self):
r"""
Disable memory efficient attention as implemented in xformers.
"""
self.unet.set_use_memory_efficient_attention_xformers(False)

def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
r"""
Enable sliced attention computation.
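
These toggles are plain pipeline methods, so enabling memory-efficient attention is a one-liner after loading. A sketch (requires the `xformers` package; model id illustrative):

```python
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to("cuda")

# Prefer xformers attention when it is installed; fall back to sliced attention otherwise.
try:
    pipe.enable_xformers_memory_efficient_attention()
except Exception:
    pipe.enable_attention_slicing()
```
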
@@ -557,7 +575,7 @@ class StableDiffusionPipeline(DiffusionPipeline):
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

# call the callback, if provided
if (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)

@@ -134,6 +134,26 @@ class StableDiffusionImageVariationPipeline(DiffusionPipeline):
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention
def enable_xformers_memory_efficient_attention(self):
r"""
Enable memory efficient attention as implemented in xformers.

When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
time. Speed up at training time is not guaranteed.

Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
is used.
"""
self.unet.set_use_memory_efficient_attention_xformers(True)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_xformers_memory_efficient_attention
def disable_xformers_memory_efficient_attention(self):
r"""
Disable memory efficient attention as implemented in xformers.
"""
self.unet.set_use_memory_efficient_attention_xformers(False)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing
def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
r"""
@@ -440,7 +460,7 @@ class StableDiffusionImageVariationPipeline(DiffusionPipeline):
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

# call the callback, if provided
if (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)

@@ -254,6 +254,26 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
return torch.device(module._hf_hook.execution_device)
return self.device

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention
def enable_xformers_memory_efficient_attention(self):
r"""
Enable memory efficient attention as implemented in xformers.

When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
time. Speed up at training time is not guaranteed.

Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
is used.
"""
self.unet.set_use_memory_efficient_attention_xformers(True)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_xformers_memory_efficient_attention
def disable_xformers_memory_efficient_attention(self):
r"""
Disable memory efficient attention as implemented in xformers.
"""
self.unet.set_use_memory_efficient_attention_xformers(False)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
r"""
@@ -424,9 +444,9 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):

return timesteps, num_inference_steps - t_start

def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
image = image.to(device=device, dtype=dtype)
init_latent_dist = self.vae.encode(image).latent_dist
def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
init_image = init_image.to(device=device, dtype=dtype)
init_latent_dist = self.vae.encode(init_image).latent_dist
init_latents = init_latent_dist.sample(generator=generator)
init_latents = 0.18215 * init_latents
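
The 0.18215 constant is the Stable Diffusion VAE scaling factor: latents are multiplied by it after encoding so they have roughly unit variance for the UNet, and divided by it again before decoding. A minimal round-trip sketch (model id illustrative):

```python
import torch
from diffusers.models import AutoencoderKL

vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")

image = torch.randn(1, 3, 512, 512)  # stand-in for a preprocessed image in [-1, 1]
with torch.no_grad():
    latents = vae.encode(image).latent_dist.sample() * 0.18215  # scale into UNet space
    decoded = vae.decode(latents / 0.18215).sample              # undo the scaling to decode
```
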

@@ -434,16 +454,16 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
# expand init_latents for batch_size
deprecation_message = (
f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
" images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
" images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
" that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
" your script to pass as many initial images as text prompts to suppress this warning."
" your script to pass as many init images as text prompts to suppress this warning."
)
deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
additional_image_per_prompt = batch_size // init_latents.shape[0]
init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
raise ValueError(
f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
)
else:
init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
@@ -461,7 +481,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
def __call__(
self,
prompt: Union[str, List[str]],
image: Union[torch.FloatTensor, PIL.Image.Image],
init_image: Union[torch.FloatTensor, PIL.Image.Image],
strength: float = 0.8,
num_inference_steps: Optional[int] = 50,
guidance_scale: Optional[float] = 7.5,
@@ -473,7 +493,6 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
callback_steps: Optional[int] = 1,
**kwargs,
):
r"""
Function invoked when calling the pipeline for generation.
@@ -481,15 +500,15 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
Args:
prompt (`str` or `List[str]`):
The prompt or prompts to guide the image generation.
image (`torch.FloatTensor` or `PIL.Image.Image`):
init_image (`torch.FloatTensor` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, that will be used as the starting point for the
process.
strength (`float`, *optional*, defaults to 0.8):
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
will be used as a starting point, adding more noise to it the larger the `strength`. The number of
denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
be maximum and the denoising process will run for the full number of iterations specified in
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
noise will be maximum and the denoising process will run for the full number of iterations specified in
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference. This parameter will be modulated by `strength`.
@@ -530,10 +549,6 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, according to the `safety_checker`.
"""
message = "Please use `image` instead of `init_image`."
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
image = init_image or image

# 1. Check inputs
self.check_inputs(prompt, strength, callback_steps)

@@ -551,8 +566,8 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
)

# 4. Preprocess image
if isinstance(image, PIL.Image.Image):
image = preprocess(image)
if isinstance(init_image, PIL.Image.Image):
init_image = preprocess(init_image)

# 5. set timesteps
self.scheduler.set_timesteps(num_inference_steps, device=device)
@@ -561,7 +576,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):

# 6. Prepare latent variables
latents = self.prepare_latents(
image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
)

# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
@@ -587,7 +602,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

# call the callback, if provided
if (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)

@@ -300,6 +300,26 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
# fix by only offloading self.safety_checker for now
cpu_offload(self.safety_checker.vision_model, device)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention
def enable_xformers_memory_efficient_attention(self):
r"""
Enable memory efficient attention as implemented in xformers.

When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
time. Speed up at training time is not guaranteed.

Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
is used.
"""
self.unet.set_use_memory_efficient_attention_xformers(True)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_xformers_memory_efficient_attention
def disable_xformers_memory_efficient_attention(self):
r"""
Disable memory efficient attention as implemented in xformers.
"""
self.unet.set_use_memory_efficient_attention_xformers(False)

@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
@@ -701,7 +721,7 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

# call the callback, if provided
if (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)

@@ -248,6 +248,26 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
# fix by only offloading self.safety_checker for now
cpu_offload(self.safety_checker.vision_model, device)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention
def enable_xformers_memory_efficient_attention(self):
r"""
Enable memory efficient attention as implemented in xformers.

When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
time. Speed up at training time is not guaranteed.

Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
is used.
"""
self.unet.set_use_memory_efficient_attention_xformers(True)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_xformers_memory_efficient_attention
def disable_xformers_memory_efficient_attention(self):
r"""
Disable memory efficient attention as implemented in xformers.
"""
self.unet.set_use_memory_efficient_attention_xformers(False)

@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
@@ -439,9 +459,9 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):

return timesteps, num_inference_steps - t_start

def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator):
image = image.to(device=self.device, dtype=dtype)
init_latent_dist = self.vae.encode(image).latent_dist
def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator):
init_image = init_image.to(device=self.device, dtype=dtype)
init_latent_dist = self.vae.encode(init_image).latent_dist
init_latents = init_latent_dist.sample(generator=generator)
init_latents = 0.18215 * init_latents

@@ -459,7 +479,7 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
def __call__(
self,
prompt: Union[str, List[str]],
image: Union[torch.FloatTensor, PIL.Image.Image],
init_image: Union[torch.FloatTensor, PIL.Image.Image],
mask_image: Union[torch.FloatTensor, PIL.Image.Image],
strength: float = 0.8,
num_inference_steps: Optional[int] = 50,
@@ -472,7 +492,6 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
callback_steps: Optional[int] = 1,
**kwargs,
):
r"""
Function invoked when calling the pipeline for generation.
@@ -480,19 +499,19 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
Args:
prompt (`str` or `List[str]`):
The prompt or prompts to guide the image generation.
image (`torch.FloatTensor` or `PIL.Image.Image`):
init_image (`torch.FloatTensor` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, that will be used as the starting point for the
process. This is the image whose masked region will be inpainted.
mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
`Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
`Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
strength (`float`, *optional*, defaults to 0.8):
Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
is 1, the denoising process will be run on the masked area for the full number of iterations specified
in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more noise to
that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
num_inference_steps (`int`, *optional*, defaults to 50):
The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
@@ -533,10 +552,6 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, according to the `safety_checker`.
"""
message = "Please use `image` instead of `init_image`."
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
image = init_image or image

# 1. Check inputs
self.check_inputs(prompt, strength, callback_steps)

@@ -554,8 +569,8 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
)

# 4. Preprocess image and mask
if not isinstance(image, torch.FloatTensor):
image = preprocess_image(image)
if not isinstance(init_image, torch.FloatTensor):
init_image = preprocess_image(init_image)

if not isinstance(mask_image, torch.FloatTensor):
mask_image = preprocess_mask(mask_image, self.vae_scale_factor)
@@ -568,7 +583,7 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
# 6. Prepare latent variables
# encode the init image into latents and scale the latents
latents, init_latents_orig, noise = self.prepare_latents(
image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
)

# 7. Prepare mask latent
@@ -602,7 +617,7 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
latents = (init_latents_proper * mask) + (latents * (1 - mask))

# call the callback, if provided
if (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)

@@ -143,6 +143,26 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline):
if cpu_offloaded_model is not None:
cpu_offload(cpu_offloaded_model, device)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention
def enable_xformers_memory_efficient_attention(self):
r"""
Enable memory efficient attention as implemented in xformers.

When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
time. Speed up at training time is not guaranteed.

Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
is used.
"""
self.unet.set_use_memory_efficient_attention_xformers(True)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_xformers_memory_efficient_attention
def disable_xformers_memory_efficient_attention(self):
r"""
Disable memory efficient attention as implemented in xformers.
"""
self.unet.set_use_memory_efficient_attention_xformers(False)

@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
@@ -515,7 +535,7 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline):
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

# call the callback, if provided
if (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)

@@ -182,6 +182,24 @@ class StableDiffusionPipelineSafe(DiffusionPipeline):
"""
self._safety_text_concept = concept

def enable_xformers_memory_efficient_attention(self):
r"""
Enable memory efficient attention as implemented in xformers.

When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
time. Speed up at training time is not guaranteed.

Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
is used.
"""
self.unet.set_use_memory_efficient_attention_xformers(True)

def disable_xformers_memory_efficient_attention(self):
r"""
Disable memory efficient attention as implemented in xformers.
"""
self.unet.set_use_memory_efficient_attention_xformers(False)

def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
r"""
Enable sliced attention computation.
@@ -711,7 +729,7 @@ class StableDiffusionPipelineSafe(DiffusionPipeline):
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

# call the callback, if provided
if (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)

@@ -330,6 +330,17 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin):
if hasattr(block, "attentions") and block.attentions is not None:
block.set_attention_slice(slice_size)

def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
for block in self.down_blocks:
if hasattr(block, "attentions") and block.attentions is not None:
block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)

self.mid_block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)

for block in self.up_blocks:
if hasattr(block, "attentions") and block.attentions is not None:
block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
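
The model-level setter simply fans the flag out to every attention-bearing block (down, mid, up), the same plumbing pattern the `set_attention_slice` method above uses. A stripped-down sketch of the idiom (class and attribute names illustrative; note the real code calls the mid block unconditionally):

```python
# The propagation idiom in isolation.
class ToyUNet:
    def __init__(self, down_blocks, mid_block, up_blocks):
        self.down_blocks, self.mid_block, self.up_blocks = down_blocks, mid_block, up_blocks

    def set_use_memory_efficient_attention_xformers(self, flag: bool):
        # Forward the toggle to every block that actually holds attention layers.
        for block in [*self.down_blocks, self.mid_block, *self.up_blocks]:
            if hasattr(block, "attentions") and block.attentions is not None:
                block.set_use_memory_efficient_attention_xformers(flag)
```
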

def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, (CrossAttnDownBlockFlat, DownBlockFlat, CrossAttnUpBlockFlat, UpBlockFlat)):
module.gradient_checkpointing = value
@@ -377,14 +388,8 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin):
timesteps = timestep
if not torch.is_tensor(timesteps):
# TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
# This would be a good case for the `match` statement (Python 3.10+)
is_mps = sample.device.type == "mps"
if torch.is_floating_point(timesteps):
dtype = torch.float32 if is_mps else torch.float64
else:
dtype = torch.int32 if is_mps else torch.int64
timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
elif len(timesteps.shape) == 0:
timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device)
elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
timesteps = timesteps[None].to(sample.device)

# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
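
The rewrite makes the scalar branch dtype-aware (float64/int64 are unsupported on mps, hence the float32/int32 fallbacks) before expanding a 0-d tensor to a batch. A small self-contained sketch of the normalization, assuming a `sample` tensor only for device and batch context:

```python
import torch

sample = torch.randn(2, 4, 64, 64)  # stand-in for the UNet input
timesteps = 10  # may arrive as an int, a float, or a 0-d tensor

if not torch.is_tensor(timesteps):
    is_mps = sample.device.type == "mps"
    if isinstance(timesteps, float):
        dtype = torch.float32 if is_mps else torch.float64
    else:
        dtype = torch.int32 if is_mps else torch.int64
    timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
elif len(timesteps.shape) == 0:
    timesteps = timesteps[None].to(sample.device)

# broadcast to the batch dimension in a way that's compatible with ONNX/Core ML
timesteps = timesteps.expand(sample.shape[0])
print(timesteps)  # tensor([10, 10])
```
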
@@ -756,6 +761,10 @@ class CrossAttnDownBlockFlat(nn.Module):
for attn in self.attentions:
attn._set_attention_slice(slice_size)

def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
for attn in self.attentions:
attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)

def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
output_states = ()

@@ -967,6 +976,10 @@ class CrossAttnUpBlockFlat(nn.Module):

self.gradient_checkpointing = False

def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
for attn in self.attentions:
attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)

def forward(
self,
hidden_states,
@@ -1109,6 +1122,10 @@ class UNetMidBlockFlatCrossAttn(nn.Module):
for attn in self.attentions:
attn._set_attention_slice(slice_size)

def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
for attn in self.attentions:
attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)

def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
hidden_states = self.resnets[0](hidden_states, temb)
for attn, resnet in zip(self.attentions, self.resnets[1:]):

@@ -147,6 +147,26 @@ class VersatileDiffusionDualGuidedPipeline(DiffusionPipeline):

self.image_unet.register_to_config(dual_cross_attention=False)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention with unet->image_unet
def enable_xformers_memory_efficient_attention(self):
r"""
Enable memory efficient attention as implemented in xformers.

When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
time. Speed up at training time is not guaranteed.

Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
is used.
"""
self.image_unet.set_use_memory_efficient_attention_xformers(True)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_xformers_memory_efficient_attention with unet->image_unet
def disable_xformers_memory_efficient_attention(self):
r"""
Disable memory efficient attention as implemented in xformers.
"""
self.image_unet.set_use_memory_efficient_attention_xformers(False)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing with unet->image_unet
def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
r"""

@@ -73,6 +73,26 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline):
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention with unet->image_unet
def enable_xformers_memory_efficient_attention(self):
r"""
Enable memory efficient attention as implemented in xformers.

When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
time. Speed up at training time is not guaranteed.

Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
is used.
"""
self.image_unet.set_use_memory_efficient_attention_xformers(True)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_xformers_memory_efficient_attention with unet->image_unet
def disable_xformers_memory_efficient_attention(self):
r"""
Disable memory efficient attention as implemented in xformers.
"""
self.image_unet.set_use_memory_efficient_attention_xformers(False)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing with unet->image_unet
def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
r"""

@@ -98,6 +98,26 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline):
def remove_unused_weights(self):
self.register_modules(text_unet=None)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention with unet->image_unet
def enable_xformers_memory_efficient_attention(self):
r"""
Enable memory efficient attention as implemented in xformers.

When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
time. Speed up at training time is not guaranteed.

Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
is used.
"""
self.image_unet.set_use_memory_efficient_attention_xformers(True)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_xformers_memory_efficient_attention with unet->image_unet
def disable_xformers_memory_efficient_attention(self):
r"""
Disable memory efficient attention as implemented in xformers.
"""
self.image_unet.set_use_memory_efficient_attention_xformers(False)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing with unet->image_unet
def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
r"""

@@ -22,8 +22,10 @@ if is_torch_available():
from .scheduling_dpmsolver_multistep import DPMSolverMultistepScheduler
from .scheduling_euler_ancestral_discrete import EulerAncestralDiscreteScheduler
from .scheduling_euler_discrete import EulerDiscreteScheduler
from .scheduling_heun import HeunDiscreteScheduler
from .scheduling_heun_discrete import HeunDiscreteScheduler
from .scheduling_ipndm import IPNDMScheduler
from .scheduling_k_dpm_2_ancestral_discrete import KDPM2AncestralDiscreteScheduler
from .scheduling_k_dpm_2_discrete import KDPM2DiscreteScheduler
from .scheduling_karras_ve import KarrasVeScheduler
from .scheduling_pndm import PNDMScheduler
from .scheduling_repaint import RePaintScheduler
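
This hunk swaps the `scheduling_heun` import for `scheduling_heun_discrete` and registers the two Karras DPM schedulers; once exported, any of them can be swapped onto a pipeline in the usual way. A sketch (model id illustrative):

```python
from diffusers import HeunDiscreteScheduler, StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
# Swap in the Heun scheduler, keeping the pipeline's trained schedule configuration.
pipe.scheduler = HeunDiscreteScheduler.from_config(pipe.scheduler.config)
```
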

@@ -134,7 +134,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
"Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
" DDIMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
)
predict_epsilon = deprecate("predict_epsilon", "0.11.0", message, take_from=kwargs)
predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
if predict_epsilon is not None:
self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")
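
These scheduler hunks only move the removal deadline of the boolean `predict_epsilon` kwarg between 0.11.0 and 0.10.0; either way, the forward-compatible spelling is the string-valued `prediction_type` the message recommends. A migration sketch (model id illustrative):

```python
from diffusers import DDPMScheduler

# Deprecated boolean flag (scheduled for removal):
# scheduler = DDPMScheduler.from_pretrained("google/ddpm-cat-256", predict_epsilon=True)

# Replacement: an explicit prediction type ("epsilon" or "sample"):
scheduler = DDPMScheduler.from_pretrained("google/ddpm-cat-256", prediction_type="epsilon")
```
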

@@ -138,7 +138,7 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
"Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
" FlaxDDIMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
)
predict_epsilon = deprecate("predict_epsilon", "0.11.0", message, take_from=kwargs)
predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
if predict_epsilon is not None:
self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")

@@ -125,7 +125,7 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
"Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
" DDPMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
)
predict_epsilon = deprecate("predict_epsilon", "0.11.0", message, take_from=kwargs)
predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
if predict_epsilon is not None:
self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")

@@ -255,7 +255,7 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
"Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
" DDPMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
)
predict_epsilon = deprecate("predict_epsilon", "0.11.0", message, take_from=kwargs)
predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
if predict_epsilon is not None:
new_config = dict(self.config)
new_config["prediction_type"] = "epsilon" if predict_epsilon else "sample"
@@ -280,12 +280,10 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
elif self.config.prediction_type == "sample":
pred_original_sample = model_output
elif self.config.prediction_type == "v_prediction":
pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or"
" `v_prediction` for the DDPMScheduler."
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` "
" for the DDPMScheduler."
)

# 3. Clip "predicted x_0"

@@ -132,7 +132,7 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
"Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
" FlaxDDPMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
)
predict_epsilon = deprecate("predict_epsilon", "0.11.0", message, take_from=kwargs)
predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
if predict_epsilon is not None:
self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")

@@ -239,7 +239,7 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
"Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
" FlaxDDPMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
)
predict_epsilon = deprecate("predict_epsilon", "0.11.0", message, take_from=kwargs)
predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
if predict_epsilon is not None:
new_config = dict(self.config)
new_config["prediction_type"] = "epsilon" if predict_epsilon else "sample"

@@ -142,7 +142,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
"Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
" DPMSolverMultistepScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
)
predict_epsilon = deprecate("predict_epsilon", "0.11.0", message, take_from=kwargs)
predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
if predict_epsilon is not None:
self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")

@@ -177,7 +177,7 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
"Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
" FlaxDPMSolverMultistepScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
)
predict_epsilon = deprecate("predict_epsilon", "0.11.0", message, take_from=kwargs)
predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
if predict_epsilon is not None:
self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")

@@ -78,7 +78,6 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
beta_end: float = 0.02,
beta_schedule: str = "linear",
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
prediction_type: str = "epsilon",
):
if trained_betas is not None:
self.betas = torch.tensor(trained_betas, dtype=torch.float32)
@@ -203,16 +202,7 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
sigma = self.sigmas[step_index]

# 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
if self.config.prediction_type == "epsilon":
pred_original_sample = sample - sigma * model_output
elif self.config.prediction_type == "v_prediction":
# * c_out + input * c_skip
pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
)

pred_original_sample = sample - sigma * model_output
sigma_from = self.sigmas[step_index]
sigma_to = self.sigmas[step_index + 1]
sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
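
In the ancestral step, the total variance between `sigma_from` and `sigma_to` is split into a deterministic part (`sigma_down`) and fresh noise (`sigma_up`) such that sigma_down² + sigma_up² = sigma_to². A self-contained sketch of the step that follows the lines above (values illustrative):

```python
import torch

# Ancestral Euler step in isolation.
sample = torch.randn(1, 4, 64, 64)
model_output = torch.randn(1, 4, 64, 64)
sigma_from, sigma_to = 2.0, 1.0

sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5  # sigma_down**2 + sigma_up**2 == sigma_to**2

pred_original_sample = sample - sigma_from * model_output        # epsilon parametrization
derivative = (sample - pred_original_sample) / sigma_from        # dx/dsigma
prev_sample = sample + derivative * (sigma_down - sigma_from)    # deterministic move
prev_sample = prev_sample + torch.randn_like(sample) * sigma_up  # ancestral noise
```
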

@@ -24,14 +24,16 @@ from .scheduling_utils import SchedulerMixin, SchedulerOutput

class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
"""
Args:
Implements Algorithm 2 (Heun steps) from Karras et al. (2022) for discrete beta schedules. Based on the original
k-diffusion implementation by Katherine Crowson:
https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L90

[`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
[`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
[`~ConfigMixin.from_config`] functions.
[`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
[`~SchedulerMixin.from_pretrained`] functions.

Args:
num_train_timesteps (`int`): number of diffusion steps used to train the model. beta_start (`float`): the
starting `beta` value of inference. beta_end (`float`): the final `beta` value. beta_schedule (`str`):
the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
@@ -54,7 +56,6 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
beta_end: float = 0.012,
beta_schedule: str = "linear",
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
prediction_type: str = "epsilon",
):
if trained_betas is not None:
self.betas = torch.tensor(trained_betas, dtype=torch.float32)
@@ -77,7 +78,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
def index_for_timestep(self, timestep):
indices = (self.timesteps == timestep).nonzero()
if self.state_in_first_order:
pos = 0 if indices.shape[0] < 2 else 1
pos = -1
else:
pos = 0
return indices[pos].item()
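
Because `set_timesteps` repeats every timestep after the first (once per Heun stage, via the `repeat_interleave` line in the next hunk), `(self.timesteps == timestep).nonzero()` can return two hits; the fixed lookup takes the last hit while in the first-order state and the first one otherwise. A toy illustration:

```python
import torch

t = torch.tensor([999, 500, 0])
# set_timesteps duplicates every timestep after the first (one copy per Heun stage):
timesteps = torch.cat([t[:1], t[1:].repeat_interleave(2)])
print(timesteps)  # tensor([999, 500, 500,   0,   0])

indices = (timesteps == 500).nonzero()
print(indices[-1].item(), indices[0].item())  # 2 1 -- first- vs. second-order lookup
```
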
@@ -132,7 +133,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
self.init_noise_sigma = self.sigmas.max()

timesteps = torch.from_numpy(timesteps)
timesteps = torch.cat([timesteps[:1], timesteps[1:].repeat_interleave(2), timesteps[-1:]])
timesteps = torch.cat([timesteps[:1], timesteps[1:].repeat_interleave(2)])

if str(device).startswith("mps"):
# mps does not support float64
@@ -185,18 +186,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
sigma_hat = sigma * (gamma + 1)  # Note: sigma_hat == sigma for now

# 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
if self.config.prediction_type == "epsilon":
sigma_input = sigma_hat if self.state_in_first_order else sigma_next
pred_original_sample = sample - sigma_input * model_output
elif self.config.prediction_type == "v_prediction":
sigma_input = sigma_hat if self.state_in_first_order else sigma_next
pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + (
sample / (sigma_input**2 + 1)
)
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
)
pred_original_sample = sample - sigma_hat * model_output

if self.state_in_first_order:
# 2. Convert to an ODE derivative
@@ -210,7 +200,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
self.sample = sample
else:
# 2. 2nd order / Heun's method
derivative = (sample - pred_original_sample) / sigma_next
derivative = (sample - pred_original_sample) / sigma_hat
derivative = (self.prev_derivative + derivative) / 2

# 3. Retrieve 1st order derivative
|
||||
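The two-phase structure above mirrors the classic Heun (improved Euler) update: take an Euler step, re-evaluate the derivative at the proposal, then average the two slopes. A standalone toy sketch of that scheme (not library code):

```python
def heun_step(f, y, t, dt):
    k1 = f(y, t)             # first-order slope (what the scheduler stores as prev_derivative)
    y_euler = y + k1 * dt    # Euler proposal (the scheduler's first-order phase)
    k2 = f(y_euler, t + dt)  # slope at the proposal (the second-order phase)
    return y + 0.5 * (k1 + k2) * dt

# dy/dt = -y from y(0) = 1: one step of 0.1 gives ~0.905, close to exp(-0.1) ~ 0.9048
print(heun_step(lambda y, t: -y, 1.0, 0.0, 0.1))
```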
@@ -0,0 +1,268 @@
# Copyright 2022 Katherine Crowson, The HuggingFace Team and hlky. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional, Tuple, Union

import numpy as np
import torch

from ..configuration_utils import ConfigMixin, register_to_config
from ..utils import _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS
from .scheduling_utils import SchedulerMixin, SchedulerOutput


class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
"""
Scheduler created by @crowsonkb in [k_diffusion](https://github.com/crowsonkb/k-diffusion), see:
https://github.com/crowsonkb/k-diffusion/blob/5b3af030dd83e0297272d861c19477735d0317ec/k_diffusion/sampling.py#L188

Scheduler inspired by DPM-Solver-2 and Algorithm 2 from Karras et al. (2022).

[`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
[`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
[`~SchedulerMixin.from_pretrained`] functions.

Args:
num_train_timesteps (`int`): number of diffusion steps used to train the model.
beta_start (`float`): the starting `beta` value of inference.
beta_end (`float`): the final `beta` value.
beta_schedule (`str`): the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from `linear` or `scaled_linear`.
trained_betas (`np.ndarray`, optional):
option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
options to clip the variance used when adding noise to the denoised sample. Choose from `fixed_small`,
`fixed_small_log`, `fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
tensor_format (`str`): whether the scheduler expects pytorch or numpy arrays.
"""

_compatibles = _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
order = 2

@register_to_config
def __init__(
self,
num_train_timesteps: int = 1000,
beta_start: float = 0.00085,  # sensible defaults
beta_end: float = 0.012,
beta_schedule: str = "linear",
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
):
if trained_betas is not None:
self.betas = torch.tensor(trained_betas, dtype=torch.float32)
elif beta_schedule == "linear":
self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
self.betas = (
torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
)
else:
raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")

self.alphas = 1.0 - self.betas
self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)

# set all values
self.set_timesteps(num_train_timesteps, None, num_train_timesteps)

def index_for_timestep(self, timestep):
indices = (self.timesteps == timestep).nonzero()
if self.state_in_first_order:
pos = -1
else:
pos = 0
return indices[pos].item()

def scale_model_input(
self,
sample: torch.FloatTensor,
timestep: Union[float, torch.FloatTensor],
) -> torch.FloatTensor:
"""
Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
current timestep.

Args:
sample (`torch.FloatTensor`): input sample
timestep (`int`, optional): current timestep

Returns:
`torch.FloatTensor`: scaled input sample
"""
step_index = self.index_for_timestep(timestep)

sigma = self.sigmas[step_index]
sample = sample / ((sigma**2 + 1) ** 0.5)
return sample

def set_timesteps(
self,
num_inference_steps: int,
device: Union[str, torch.device] = None,
num_train_timesteps: Optional[int] = None,
):
"""
Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.

Args:
num_inference_steps (`int`):
the number of diffusion steps used when generating samples with a pre-trained model.
device (`str` or `torch.device`, optional):
the device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
"""
self.num_inference_steps = num_inference_steps

num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps

timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()

sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
self.log_sigmas = torch.from_numpy(np.log(sigmas)).to(device)

sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
sigmas = torch.from_numpy(sigmas).to(device=device)

# compute up and down sigmas
sigmas_next = sigmas.roll(-1)
sigmas_next[-1] = 0.0
sigmas_up = (sigmas_next**2 * (sigmas**2 - sigmas_next**2) / sigmas**2) ** 0.5
sigmas_down = (sigmas_next**2 - sigmas_up**2) ** 0.5
sigmas_down[-1] = 0.0

self.sigmas = torch.cat([sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]])
self.sigmas_up = torch.cat([sigmas_up[:1], sigmas_up[1:].repeat_interleave(2), sigmas_up[-1:]])
self.sigmas_down = torch.cat([sigmas_down[:1], sigmas_down[1:].repeat_interleave(2), sigmas_down[-1:]])

# standard deviation of the initial noise distribution
self.init_noise_sigma = self.sigmas.max()

timesteps = torch.from_numpy(timesteps)
timesteps = torch.cat([timesteps[:1], timesteps[1:].repeat_interleave(2)])

if str(device).startswith("mps"):
# mps does not support float64
self.timesteps = timesteps.to(device, dtype=torch.float32)
else:
self.timesteps = timesteps

self.sample = None

@property
def state_in_first_order(self):
return self.sample is None

def step(
self,
model_output: Union[torch.FloatTensor, np.ndarray],
timestep: Union[float, torch.FloatTensor],
sample: Union[torch.FloatTensor, np.ndarray],
generator: Optional[torch.Generator] = None,
return_dict: bool = True,
) -> Union[SchedulerOutput, Tuple]:
"""
Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
process from the learned model outputs (most often the predicted noise).

Args:
model_output (`torch.FloatTensor` or `np.ndarray`): direct output from learned diffusion model.
timestep (`int`): current discrete timestep in the diffusion chain.
sample (`torch.FloatTensor` or `np.ndarray`): current instance of sample being created by diffusion process.
return_dict (`bool`): option for returning tuple rather than SchedulerOutput class

Returns:
[`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
[`~schedulers.scheduling_utils.SchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When
returning a tuple, the first element is the sample tensor.
"""
step_index = self.index_for_timestep(timestep)

if self.state_in_first_order:
sigma = self.sigmas[step_index]
sigma_next = self.sigmas[step_index + 1]
else:
# 2nd order / KDPM2's method
sigma = self.sigmas[step_index - 1]
sigma_next = self.sigmas[step_index]
sigma_up = self.sigmas_up[step_index - 1]
sigma_down = self.sigmas_down[step_index - 1]

# currently only gamma=0 is supported. This usually works best anyways.
# We can support gamma in the future but then need to scale the timestep before
# passing it to the model which requires a change in API
gamma = 0
sigma_hat = sigma * (gamma + 1)  # Note: sigma_hat == sigma for now

device = model_output.device
if device.type == "mps":
# randn does not work reproducibly on mps
noise = torch.randn(model_output.shape, dtype=model_output.dtype, device="cpu", generator=generator).to(
device
)
else:
noise = torch.randn(model_output.shape, dtype=model_output.dtype, device=device, generator=generator).to(
device
)

# 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
pred_original_sample = sample - sigma_hat * model_output

if self.state_in_first_order:
# 2. Convert to an ODE derivative
derivative = (sample - pred_original_sample) / sigma_hat
# 3. 1st order derivative
dt = sigma_next - sigma_hat

# store for 2nd order step
self.sample = sample
self.dt = dt
prev_sample = sample + derivative * dt
else:
# DPM-Solver-2
derivative = (sample - pred_original_sample) / sigma_hat
dt = sigma_down - sigma_hat

sample = self.sample
self.sample = None

prev_sample = sample + derivative * dt
prev_sample = prev_sample + noise * sigma_up

if not return_dict:
return (prev_sample,)

return SchedulerOutput(prev_sample=prev_sample)

def add_noise(
self,
original_samples: torch.FloatTensor,
noise: torch.FloatTensor,
timesteps: torch.FloatTensor,
) -> torch.FloatTensor:
# Make sure sigmas and timesteps have the same device and dtype as original_samples
self.sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
# mps does not support float64
self.timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
else:
self.timesteps = self.timesteps.to(original_samples.device)
timesteps = timesteps.to(original_samples.device)

step_indices = [self.index_for_timestep(t) for t in timesteps]

sigma = self.sigmas[step_indices].flatten()
while len(sigma.shape) < len(original_samples.shape):
sigma = sigma.unsqueeze(-1)

noisy_samples = original_samples + noise * sigma
return noisy_samples

def __len__(self):
return self.config.num_train_timesteps
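A hypothetical usage sketch for the new scheduler, assuming the usual config-swap pattern from the docs (model id, prompt, and seed are placeholders, not from this diff):

```python
import torch
from diffusers import KDPM2AncestralDiscreteScheduler, StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.scheduler = KDPM2AncestralDiscreteScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to("cuda")

# ancestral sampling injects fresh noise each step, so seed the generator for reproducibility
generator = torch.Generator(device="cuda").manual_seed(0)
image = pipe("A fantasy castle at sunset", generator=generator).images[0]
```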
283
src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py
Normal file
@@ -0,0 +1,283 @@
# Copyright 2022 Katherine Crowson, The HuggingFace Team and hlky. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional, Tuple, Union

import numpy as np
import torch

from ..configuration_utils import ConfigMixin, register_to_config
from ..utils import _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS
from .scheduling_utils import SchedulerMixin, SchedulerOutput


class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
"""
Scheduler created by @crowsonkb in [k_diffusion](https://github.com/crowsonkb/k-diffusion), see:
https://github.com/crowsonkb/k-diffusion/blob/5b3af030dd83e0297272d861c19477735d0317ec/k_diffusion/sampling.py#L188

Scheduler inspired by DPM-Solver-2 and Algorithm 2 from Karras et al. (2022).

[`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
[`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
[`~SchedulerMixin.from_pretrained`] functions.

Args:
num_train_timesteps (`int`): number of diffusion steps used to train the model.
beta_start (`float`): the starting `beta` value of inference.
beta_end (`float`): the final `beta` value.
beta_schedule (`str`): the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from `linear` or `scaled_linear`.
trained_betas (`np.ndarray`, optional):
option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
options to clip the variance used when adding noise to the denoised sample. Choose from `fixed_small`,
`fixed_small_log`, `fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
tensor_format (`str`): whether the scheduler expects pytorch or numpy arrays.
"""

_compatibles = _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
order = 2

@register_to_config
def __init__(
self,
num_train_timesteps: int = 1000,
beta_start: float = 0.00085,  # sensible defaults
beta_end: float = 0.012,
beta_schedule: str = "linear",
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
):
if trained_betas is not None:
self.betas = torch.tensor(trained_betas, dtype=torch.float32)
elif beta_schedule == "linear":
self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
self.betas = (
torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
)
else:
raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")

self.alphas = 1.0 - self.betas
self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)

# set all values
self.set_timesteps(num_train_timesteps, None, num_train_timesteps)

def index_for_timestep(self, timestep):
indices = (self.timesteps == timestep).nonzero()
if self.state_in_first_order:
pos = -1
else:
pos = 0
return indices[pos].item()

def scale_model_input(
self,
sample: torch.FloatTensor,
timestep: Union[float, torch.FloatTensor],
) -> torch.FloatTensor:
"""
Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
current timestep.

Args:
sample (`torch.FloatTensor`): input sample
timestep (`int`, optional): current timestep

Returns:
`torch.FloatTensor`: scaled input sample
"""
step_index = self.index_for_timestep(timestep)

if self.state_in_first_order:
sigma = self.sigmas[step_index]
else:
sigma = self.sigmas_interpol[step_index]

sample = sample / ((sigma**2 + 1) ** 0.5)
return sample

def set_timesteps(
self,
num_inference_steps: int,
device: Union[str, torch.device] = None,
num_train_timesteps: Optional[int] = None,
):
"""
Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.

Args:
num_inference_steps (`int`):
the number of diffusion steps used when generating samples with a pre-trained model.
device (`str` or `torch.device`, optional):
the device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
"""
self.num_inference_steps = num_inference_steps

num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps

timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()

sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
self.log_sigmas = torch.from_numpy(np.log(sigmas)).to(device)

sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
sigmas = torch.from_numpy(sigmas).to(device=device)

# interpolate sigmas
sigmas_interpol = sigmas.log().lerp(sigmas.roll(1).log(), 0.5).exp()

self.sigmas = torch.cat([sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]])
self.sigmas_interpol = torch.cat(
[sigmas_interpol[:1], sigmas_interpol[1:].repeat_interleave(2), sigmas_interpol[-1:]]
)

# standard deviation of the initial noise distribution
self.init_noise_sigma = self.sigmas.max()

timesteps = torch.from_numpy(timesteps).to(device)

# interpolate timesteps
timesteps_interpol = self.sigma_to_t(sigmas_interpol).to(device)
interleaved_timesteps = torch.stack((timesteps_interpol[1:-1, None], timesteps[1:, None]), dim=-1).flatten()
timesteps = torch.cat([timesteps[:1], interleaved_timesteps])

if str(device).startswith("mps"):
# mps does not support float64
self.timesteps = timesteps.to(torch.float32)
else:
self.timesteps = timesteps

self.sample = None

def sigma_to_t(self, sigma):
# get log sigma
log_sigma = sigma.log()

# get distribution
dists = log_sigma - self.log_sigmas[:, None]

# get sigmas range
low_idx = dists.ge(0).cumsum(dim=0).argmax(dim=0).clamp(max=self.log_sigmas.shape[0] - 2)
high_idx = low_idx + 1

low = self.log_sigmas[low_idx]
high = self.log_sigmas[high_idx]

# interpolate sigmas
w = (low - log_sigma) / (low - high)
w = w.clamp(0, 1)

# transform interpolation to time range
t = (1 - w) * low_idx + w * high_idx
t = t.view(sigma.shape)
return t

@property
def state_in_first_order(self):
return self.sample is None

def step(
self,
model_output: Union[torch.FloatTensor, np.ndarray],
timestep: Union[float, torch.FloatTensor],
sample: Union[torch.FloatTensor, np.ndarray],
return_dict: bool = True,
) -> Union[SchedulerOutput, Tuple]:
"""
Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
process from the learned model outputs (most often the predicted noise).

Args:
model_output (`torch.FloatTensor` or `np.ndarray`): direct output from learned diffusion model.
timestep (`int`): current discrete timestep in the diffusion chain.
sample (`torch.FloatTensor` or `np.ndarray`): current instance of sample being created by diffusion process.
return_dict (`bool`): option for returning tuple rather than SchedulerOutput class

Returns:
[`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
[`~schedulers.scheduling_utils.SchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When
returning a tuple, the first element is the sample tensor.
"""
step_index = self.index_for_timestep(timestep)

if self.state_in_first_order:
sigma = self.sigmas[step_index]
sigma_interpol = self.sigmas_interpol[step_index + 1]
sigma_next = self.sigmas[step_index + 1]
else:
# 2nd order / KDPM2's method
sigma = self.sigmas[step_index - 1]
sigma_interpol = self.sigmas_interpol[step_index]
sigma_next = self.sigmas[step_index]

# currently only gamma=0 is supported. This usually works best anyways.
# We can support gamma in the future but then need to scale the timestep before
# passing it to the model which requires a change in API
gamma = 0
sigma_hat = sigma * (gamma + 1)  # Note: sigma_hat == sigma for now

# 1. compute predicted original sample (x_0) from sigma-scaled predicted noise

if self.state_in_first_order:
pred_original_sample = sample - sigma_hat * model_output
# 2. Convert to an ODE derivative
derivative = (sample - pred_original_sample) / sigma_hat
# 3. 1st order derivative
dt = sigma_interpol - sigma_hat

# store for 2nd order step
self.sample = sample
else:
# DPM-Solver-2
pred_original_sample = sample - sigma_interpol * model_output
derivative = (sample - pred_original_sample) / sigma_interpol

dt = sigma_next - sigma_hat

sample = self.sample
self.sample = None

prev_sample = sample + derivative * dt

if not return_dict:
return (prev_sample,)

return SchedulerOutput(prev_sample=prev_sample)

def add_noise(
self,
original_samples: torch.FloatTensor,
noise: torch.FloatTensor,
timesteps: torch.FloatTensor,
) -> torch.FloatTensor:
# Make sure sigmas and timesteps have the same device and dtype as original_samples
self.sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
# mps does not support float64
self.timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
else:
self.timesteps = self.timesteps.to(original_samples.device)
timesteps = timesteps.to(original_samples.device)

step_indices = [self.index_for_timestep(t) for t in timesteps]

sigma = self.sigmas[step_indices].flatten()
while len(sigma.shape) < len(original_samples.shape):
sigma = sigma.unsqueeze(-1)

noisy_samples = original_samples + noise * sigma
return noisy_samples

def __len__(self):
return self.config.num_train_timesteps
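One detail worth calling out in `set_timesteps` above: lerping the log-sigmas with weight 0.5 places each `sigmas_interpol` entry at the geometric mean of its two neighbours, which is the midpoint where the DPM-Solver-2-style second model evaluation happens (hence `order = 2`). A quick illustrative check:

```python
import torch

sigmas = torch.tensor([14.6, 9.7, 6.2])
midpoints = sigmas.log().lerp(sigmas.roll(1).log(), 0.5).exp()
print(torch.allclose(midpoints[1], (sigmas[1] * sigmas[0]).sqrt()))  # True
```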
@@ -78,7 +78,6 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
beta_end: float = 0.02,
beta_schedule: str = "linear",
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
prediction_type: str = "epsilon",
):
if trained_betas is not None:
self.betas = torch.tensor(trained_betas, dtype=torch.float32)
@@ -216,15 +215,7 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
sigma = self.sigmas[step_index]

# 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
if self.config.prediction_type == "epsilon":
pred_original_sample = sample - sigma * model_output
elif self.config.prediction_type == "v_prediction":
# * c_out + input * c_skip
pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
)
pred_original_sample = sample - sigma * model_output

# 2. Convert to an ODE derivative
derivative = (sample - pred_original_sample) / sigma

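For reference, the `v_prediction` branch removed here is the preconditioning with `c_skip = 1 / (sigma**2 + 1)` and `c_out = -sigma / (sigma**2 + 1) ** 0.5`: if the model predicts `v` (in the angle parameterization where `sigma = tan(phi)`), that combination reconstructs `x_0` exactly. A numeric sanity check of the identity (illustrative values, not library code):

```python
import torch

torch.manual_seed(0)
x0, eps, sigma = torch.randn(4), torch.randn(4), torch.tensor(3.7)

sample = x0 + sigma * eps                       # sigma-scaled sample, as in the scheduler
phi = torch.atan(sigma)
v = torch.cos(phi) * eps - torch.sin(phi) * x0  # v-prediction target

pred_x0 = v * (-sigma / (sigma**2 + 1) ** 0.5) + sample / (sigma**2 + 1)
assert torch.allclose(pred_x0, x0, atol=1e-5)
```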
@@ -102,7 +102,6 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin):
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
skip_prk_steps: bool = False,
set_alpha_to_one: bool = False,
prediction_type: str = "epsilon",
steps_offset: int = 0,
):
if trained_betas is not None:
@@ -369,13 +368,6 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin):
beta_prod_t = 1 - alpha_prod_t
beta_prod_t_prev = 1 - alpha_prod_t_prev

if self.config.prediction_type == "v_prediction":
model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
elif self.config.prediction_type != "epsilon":
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon` or `v_prediction`"
)

# corresponds to (α_(t−δ) - α_t) divided by
# denominator of x_t in formula (9) and plus 1
# Note: (α_(t−δ) - α_t) / (sqrt(α_t) * (sqrt(α_(t−δ)) + sqrt(α_t))) =

@@ -32,7 +32,7 @@ def deprecate(*args, take_from: Optional[Union[Dict, Any]] = None, standard_warn

if warning is not None:
warning = warning + " " if standard_warn else ""
warnings.warn(warning + message, FutureWarning, stacklevel=2)
warnings.warn(warning + message, FutureWarning)

if isinstance(deprecated_kwargs, dict) and len(deprecated_kwargs) > 0:
call_frame = inspect.getouterframes(inspect.currentframe())[1]

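The hunk above toggles the `stacklevel` argument on the deprecation warning. With `stacklevel=2`, Python attributes the `FutureWarning` to the code that called the helper rather than to the helper itself, which makes the reported file and line actionable for users. A standalone sketch of the effect (not the diffusers helper):

```python
import warnings

def deprecate_sketch(message):
    # stacklevel=2: the warning points at deprecate_sketch's caller,
    # so users see their own file/line in the warning output.
    warnings.warn(message, FutureWarning, stacklevel=2)

deprecate_sketch("`init_image` is deprecated, please use `image` instead")
```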
@@ -407,6 +407,36 @@ class KarrasVeScheduler(metaclass=DummyObject):
requires_backends(cls, ["torch"])


class KDPM2AncestralDiscreteScheduler(metaclass=DummyObject):
_backends = ["torch"]

def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])

@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch"])

@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])


class KDPM2DiscreteScheduler(metaclass=DummyObject):
_backends = ["torch"]

def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])

@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch"])

@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])


class PNDMScheduler(metaclass=DummyObject):
_backends = ["torch"]

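These stubs keep the public API importable when torch is missing: the class exists, but any attempt to construct or load it fails with an installation hint via `requires_backends`. A simplified standalone sketch of the pattern (the real helper lives in `diffusers.utils` and raises a richer message):

```python
class DummyObject(type):
    # Calling a class whose metaclass is DummyObject raises instead of instantiating.
    def __call__(cls, *args, **kwargs):
        raise ImportError(f"{cls.__name__} requires the `torch` backend to be installed.")

class FakeScheduler(metaclass=DummyObject):
    _backends = ["torch"]

try:
    FakeScheduler()
except ImportError as err:
    print(err)  # FakeScheduler requires the `torch` backend to be installed.
```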
@@ -141,7 +141,7 @@ class AltDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCas
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
)

image = output.images
@@ -153,7 +153,7 @@ class AltDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCas
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
return_dict=False,
)[0]

@@ -204,7 +204,7 @@ class AltDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCas
generator=generator,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
).images

assert image.shape == (1, 32, 32, 3)
@@ -243,7 +243,7 @@ class AltDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
generator = torch.Generator(device=torch_device).manual_seed(0)
output = pipe(
prompt=prompt,
image=init_image,
init_image=init_image,
strength=0.75,
guidance_scale=7.5,
generator=generator,

@@ -69,7 +69,7 @@ class DDPMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

def test_inference_deprecated_predict_epsilon(self):
deprecate("remove this test", "0.11.0", "remove")
deprecate("remove this test", "0.10.0", "remove")
unet = self.dummy_uncond_unet
scheduler = DDPMScheduler(predict_epsilon=False)


@@ -79,7 +79,7 @@ class LDMSuperResolutionPipelineFastTests(PipelineTesterMixin, unittest.TestCase
init_image = self.dummy_image.to(device)

generator = torch.Generator(device=device).manual_seed(0)
image = ldm(image=init_image, generator=generator, num_inference_steps=2, output_type="numpy").images
image = ldm(init_image, generator=generator, num_inference_steps=2, output_type="numpy").images

image_slice = image[0, -3:, -3:, -1]

@@ -124,7 +124,7 @@ class LDMSuperResolutionPipelineIntegrationTests(unittest.TestCase):
ldm.set_progress_bar_config(disable=None)

generator = torch.Generator(device=torch_device).manual_seed(0)
image = ldm(image=init_image, generator=generator, num_inference_steps=20, output_type="numpy").images
image = ldm(init_image, generator=generator, num_inference_steps=20, output_type="numpy").images

image_slice = image[0, -3:, -3:, -1]


@@ -186,7 +186,7 @@ class CycleDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
source_prompt=source_prompt,
generator=generator,
num_inference_steps=2,
image=init_image,
init_image=init_image,
eta=0.1,
strength=0.8,
guidance_scale=3,
@@ -244,7 +244,7 @@ class CycleDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
source_prompt=source_prompt,
generator=generator,
num_inference_steps=2,
image=init_image,
init_image=init_image,
eta=0.1,
strength=0.8,
guidance_scale=3,
@@ -297,7 +297,7 @@ class CycleDiffusionPipelineIntegrationTests(unittest.TestCase):
output = pipe(
prompt=prompt,
source_prompt=source_prompt,
image=init_image,
init_image=init_image,
num_inference_steps=100,
eta=0.1,
strength=0.85,
@@ -336,7 +336,7 @@ class CycleDiffusionPipelineIntegrationTests(unittest.TestCase):
output = pipe(
prompt=prompt,
source_prompt=source_prompt,
image=init_image,
init_image=init_image,
num_inference_steps=100,
eta=0.1,
strength=0.85,

@@ -72,7 +72,7 @@ class OnnxStableDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
generator = np.random.RandomState(0)
output = pipe(
prompt=prompt,
image=init_image,
init_image=init_image,
strength=0.75,
guidance_scale=7.5,
num_inference_steps=10,
@@ -110,7 +110,7 @@ class OnnxStableDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
generator = np.random.RandomState(0)
output = pipe(
prompt=prompt,
image=init_image,
init_image=init_image,
strength=0.75,
guidance_scale=7.5,
num_inference_steps=10,

@@ -80,7 +80,7 @@ class StableDiffusionOnnxInpaintLegacyPipelineIntegrationTests(unittest.TestCase
generator = np.random.RandomState(0)
output = pipe(
prompt=prompt,
image=init_image,
init_image=init_image,
mask_image=mask_image,
strength=0.75,
guidance_scale=7.5,

@@ -188,7 +188,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
)

image = output.images
@@ -200,7 +200,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
return_dict=False,
)[0]

@@ -245,7 +245,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
)
image = output.images
image_slice = image[0, -3:, -3:, -1]
@@ -285,7 +285,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
)

image = output.images
@@ -328,7 +328,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
)
image = output.images

@@ -339,7 +339,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
return_dict=False,
)
image_from_tuple = output[0]
@@ -382,7 +382,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
prompt,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
).images

assert images.shape == (1, 32, 32, 3)
@@ -393,7 +393,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
[prompt] * batch_size,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
).images

assert images.shape == (batch_size, 32, 32, 3)
@@ -404,7 +404,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
prompt,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
num_images_per_prompt=num_images_per_prompt,
).images

@@ -416,7 +416,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
[prompt] * batch_size,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
num_images_per_prompt=num_images_per_prompt,
).images

@@ -458,7 +458,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
generator=generator,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
).images

assert image.shape == (1, 32, 32, 3)
@@ -497,7 +497,7 @@ class StableDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
generator = torch.Generator(device=torch_device).manual_seed(0)
output = pipe(
prompt=prompt,
image=init_image,
init_image=init_image,
strength=0.75,
guidance_scale=7.5,
generator=generator,
@@ -535,7 +535,7 @@ class StableDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
generator = torch.Generator(device=torch_device).manual_seed(0)
output = pipe(
prompt=prompt,
image=init_image,
init_image=init_image,
strength=0.75,
guidance_scale=7.5,
generator=generator,
@@ -572,7 +572,7 @@ class StableDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
generator = torch.Generator(device=torch_device).manual_seed(0)
output = pipe(
prompt=prompt,
image=init_image,
init_image=init_image,
strength=0.75,
guidance_scale=7.5,
generator=generator,
@@ -626,7 +626,7 @@ class StableDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
with torch.autocast(torch_device):
pipe(
prompt=prompt,
image=init_image,
init_image=init_image,
strength=0.75,
num_inference_steps=50,
guidance_scale=7.5,
@@ -663,7 +663,7 @@ class StableDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
generator = torch.Generator(device=torch_device).manual_seed(0)
_ = pipe(
prompt=prompt,
image=init_image,
init_image=init_image,
strength=0.75,
guidance_scale=7.5,
generator=generator,

@@ -191,7 +191,7 @@ class StableDiffusionInpaintLegacyPipelineFastTests(PipelineTesterMixin, unittes
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
mask_image=mask_image,
)

@@ -204,7 +204,7 @@ class StableDiffusionInpaintLegacyPipelineFastTests(PipelineTesterMixin, unittes
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
mask_image=mask_image,
return_dict=False,
)[0]
@@ -252,7 +252,7 @@ class StableDiffusionInpaintLegacyPipelineFastTests(PipelineTesterMixin, unittes
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
mask_image=mask_image,
)

@@ -295,7 +295,7 @@ class StableDiffusionInpaintLegacyPipelineFastTests(PipelineTesterMixin, unittes
prompt,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
mask_image=mask_image,
).images

@@ -307,7 +307,7 @@ class StableDiffusionInpaintLegacyPipelineFastTests(PipelineTesterMixin, unittes
[prompt] * batch_size,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
mask_image=mask_image,
).images

@@ -319,7 +319,7 @@ class StableDiffusionInpaintLegacyPipelineFastTests(PipelineTesterMixin, unittes
prompt,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
mask_image=mask_image,
num_images_per_prompt=num_images_per_prompt,
).images
@@ -332,7 +332,7 @@ class StableDiffusionInpaintLegacyPipelineFastTests(PipelineTesterMixin, unittes
[prompt] * batch_size,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
mask_image=mask_image,
num_images_per_prompt=num_images_per_prompt,
).images
@@ -374,7 +374,7 @@ class StableDiffusionInpaintLegacyPipelineIntegrationTests(unittest.TestCase):
generator = torch.Generator(device=torch_device).manual_seed(0)
output = pipe(
prompt=prompt,
image=init_image,
init_image=init_image,
mask_image=mask_image,
strength=0.75,
guidance_scale=7.5,
@@ -416,7 +416,7 @@ class StableDiffusionInpaintLegacyPipelineIntegrationTests(unittest.TestCase):
generator = torch.Generator(device=torch_device).manual_seed(0)
output = pipe(
prompt=prompt,
image=init_image,
init_image=init_image,
mask_image=mask_image,
strength=0.75,
guidance_scale=7.5,
@@ -474,7 +474,7 @@ class StableDiffusionInpaintLegacyPipelineIntegrationTests(unittest.TestCase):
with torch.autocast(torch_device):
pipe(
prompt=prompt,
image=init_image,
init_image=init_image,
mask_image=mask_image,
strength=0.75,
num_inference_steps=50,

@@ -203,7 +203,7 @@ class ConfigTester(unittest.TestCase):
ddpm_2 = DDPMScheduler.from_pretrained("google/ddpm-celebahq-256", beta_start=88)

with CaptureLogger(logger) as cap_logger:
deprecate("remove this case", "0.11.0", "remove")
deprecate("remove this case", "0.10.0", "remove")
ddpm_3 = DDPMScheduler.from_pretrained(
"hf-internal-testing/tiny-stable-diffusion-torch",
subfolder="scheduler",

@@ -95,35 +95,6 @@ class DownloadTests(unittest.TestCase):
# We need to never convert this tiny model to safetensors for this test to pass
assert not any(f.endswith(".safetensors") for f in files)

def test_returned_cached_folder(self):
prompt = "hello"
pipe = StableDiffusionPipeline.from_pretrained(
"hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
)
_, local_path = StableDiffusionPipeline.from_pretrained(
"hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None, return_cached_folder=True
)
pipe_2 = StableDiffusionPipeline.from_pretrained(local_path)

pipe = pipe.to(torch_device)
pipe_2 = pipe.to(torch_device)
if torch_device == "mps":
# device type MPS is not supported for torch.Generator() api.
generator = torch.manual_seed(0)
else:
generator = torch.Generator(device=torch_device).manual_seed(0)

out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images

if torch_device == "mps":
# device type MPS is not supported for torch.Generator() api.
generator = torch.manual_seed(0)
else:
generator = torch.Generator(device=torch_device).manual_seed(0)
out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images

assert np.max(np.abs(out - out_2)) < 1e-3

def test_download_safetensors(self):
with tempfile.TemporaryDirectory() as tmpdirname:
# pipeline has Flax weights
@@ -440,7 +411,7 @@ class PipelineFastTests(unittest.TestCase):
generator=generator,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
mask_image=mask_image,
).images
image_img2img = img2img(
@@ -448,7 +419,7 @@ class PipelineFastTests(unittest.TestCase):
generator=generator,
num_inference_steps=2,
output_type="np",
image=init_image,
init_image=init_image,
).images
image_text2img = text2img(
[prompt],

@@ -32,6 +32,8 @@ from diffusers import (
EulerDiscreteScheduler,
HeunDiscreteScheduler,
IPNDMScheduler,
KDPM2AncestralDiscreteScheduler,
KDPM2DiscreteScheduler,
LMSDiscreteScheduler,
PNDMScheduler,
ScoreSdeVeScheduler,
@@ -635,16 +637,16 @@ class DDPMSchedulerTest(SchedulerCommonTest):
self.check_over_configs(clip_sample=clip_sample)

def test_prediction_type(self):
for prediction_type in ["epsilon", "sample", "v_prediction"]:
for prediction_type in ["epsilon", "sample"]:
self.check_over_configs(prediction_type=prediction_type)

def test_deprecated_predict_epsilon(self):
deprecate("remove this test", "0.11.0", "remove")
deprecate("remove this test", "0.10.0", "remove")
for predict_epsilon in [True, False]:
self.check_over_configs(predict_epsilon=predict_epsilon)

def test_deprecated_epsilon(self):
deprecate("remove this test", "0.11.0", "remove")
deprecate("remove this test", "0.10.0", "remove")
scheduler_class = self.scheduler_classes[0]
scheduler_config = self.get_scheduler_config()

@@ -711,37 +713,6 @@ class DDPMSchedulerTest(SchedulerCommonTest):
assert abs(result_sum.item() - 258.9070) < 1e-2
assert abs(result_mean.item() - 0.3374) < 1e-3

def test_full_loop_with_v_prediction(self):
scheduler_class = self.scheduler_classes[0]
scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
scheduler = scheduler_class(**scheduler_config)

num_trained_timesteps = len(scheduler)

model = self.dummy_model()
sample = self.dummy_sample_deter
generator = torch.manual_seed(0)

for t in reversed(range(num_trained_timesteps)):
# 1. predict noise residual
residual = model(sample, t)

# 2. predict previous mean of sample x_t-1
pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample

# if t > 0:
#     noise = self.dummy_sample_deter
#     variance = scheduler.get_variance(t) ** (0.5) * noise
#
#     sample = pred_prev_sample + variance
sample = pred_prev_sample

result_sum = torch.sum(torch.abs(sample))
result_mean = torch.mean(torch.abs(sample))

assert abs(result_sum.item() - 201.9864) < 1e-2
assert abs(result_mean.item() - 0.2630) < 1e-3


class DDIMSchedulerTest(SchedulerCommonTest):
scheduler_classes = (DDIMScheduler,)
@@ -799,10 +770,6 @@ class DDIMSchedulerTest(SchedulerCommonTest):
for schedule in ["linear", "squaredcos_cap_v2"]:
self.check_over_configs(beta_schedule=schedule)

def test_prediction_type(self):
for prediction_type in ["epsilon", "v_prediction"]:
self.check_over_configs(prediction_type=prediction_type)

def test_clip_sample(self):
for clip_sample in [True, False]:
self.check_over_configs(clip_sample=clip_sample)
@@ -840,15 +807,6 @@ class DDIMSchedulerTest(SchedulerCommonTest):
assert abs(result_sum.item() - 172.0067) < 1e-2
assert abs(result_mean.item() - 0.223967) < 1e-3

def test_full_loop_with_v_prediction(self):
sample = self.full_loop(prediction_type="v_prediction")

result_sum = torch.sum(torch.abs(sample))
result_mean = torch.mean(torch.abs(sample))

assert abs(result_sum.item() - 52.5302) < 1e-2
assert abs(result_mean.item() - 0.0684) < 1e-3

def test_full_loop_with_set_alpha_to_one(self):
# We specify different beta, so that the first alpha is 0.99
sample = self.full_loop(set_alpha_to_one=True, beta_start=0.01)
@@ -1015,10 +973,6 @@ class DPMSolverMultistepSchedulerTest(SchedulerCommonTest):
solver_type=solver_type,
)

def test_prediction_type(self):
for prediction_type in ["epsilon", "v_prediction"]:
self.check_over_configs(prediction_type=prediction_type)

def test_solver_order_and_type(self):
for algorithm_type in ["dpmsolver", "dpmsolver++"]:
for solver_type in ["midpoint", "heun"]:
@@ -1052,12 +1006,6 @@ class DPMSolverMultistepSchedulerTest(SchedulerCommonTest):

assert abs(result_mean.item() - 0.3301) < 1e-3

def test_full_loop_with_v_prediction(self):
sample = self.full_loop(prediction_type="v_prediction")
result_mean = torch.mean(torch.abs(sample))

assert abs(result_mean.item() - 0.2251) < 1e-3

def test_fp16_support(self):
scheduler_class = self.scheduler_classes[0]
scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0)
@@ -1238,10 +1186,6 @@ class PNDMSchedulerTest(SchedulerCommonTest):
for schedule in ["linear", "squaredcos_cap_v2"]:
self.check_over_configs(beta_schedule=schedule)

def test_prediction_type(self):
for prediction_type in ["epsilon", "v_prediction"]:
self.check_over_configs(prediction_type=prediction_type)

def test_time_indices(self):
for t in [1, 5, 10]:
self.check_over_forward(time_step=t)
@@ -1283,14 +1227,6 @@ class PNDMSchedulerTest(SchedulerCommonTest):
assert abs(result_sum.item() - 198.1318) < 1e-2
assert abs(result_mean.item() - 0.2580) < 1e-3

def test_full_loop_with_v_prediction(self):
sample = self.full_loop(prediction_type="v_prediction")
result_sum = torch.sum(torch.abs(sample))
result_mean = torch.mean(torch.abs(sample))

assert abs(result_sum.item() - 67.3986) < 1e-2
assert abs(result_mean.item() - 0.0878) < 1e-3

def test_full_loop_with_set_alpha_to_one(self):
# We specify different beta, so that the first alpha is 0.99
sample = self.full_loop(set_alpha_to_one=True, beta_start=0.01)
@@ -1519,10 +1455,6 @@ class LMSDiscreteSchedulerTest(SchedulerCommonTest):
for schedule in ["linear", "scaled_linear"]:
self.check_over_configs(beta_schedule=schedule)

def test_prediction_type(self):
for prediction_type in ["epsilon", "v_prediction"]:
self.check_over_configs(prediction_type=prediction_type)

def test_time_indices(self):
for t in [0, 500, 800]:
self.check_over_forward(time_step=t)
@@ -1551,30 +1483,6 @@ class LMSDiscreteSchedulerTest(SchedulerCommonTest):
assert abs(result_sum.item() - 1006.388) < 1e-2
assert abs(result_mean.item() - 1.31) < 1e-3

def test_full_loop_with_v_prediction(self):
scheduler_class = self.scheduler_classes[0]
scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
scheduler = scheduler_class(**scheduler_config)

scheduler.set_timesteps(self.num_inference_steps)

model = self.dummy_model()
sample = self.dummy_sample_deter * scheduler.init_noise_sigma

for i, t in enumerate(scheduler.timesteps):
sample = scheduler.scale_model_input(sample, t)

model_output = model(sample, t)

output = scheduler.step(model_output, t, sample)
sample = output.prev_sample

result_sum = torch.sum(torch.abs(sample))
result_mean = torch.mean(torch.abs(sample))

assert abs(result_sum.item() - 0.0017) < 1e-2
assert abs(result_mean.item() - 2.2676e-06) < 1e-3

def test_full_loop_device(self):
scheduler_class = self.scheduler_classes[0]
scheduler_config = self.get_scheduler_config()
@@ -1628,10 +1536,6 @@ class EulerDiscreteSchedulerTest(SchedulerCommonTest):
for schedule in ["linear", "scaled_linear"]:
self.check_over_configs(beta_schedule=schedule)

def test_prediction_type(self):
for prediction_type in ["epsilon", "v_prediction"]:
self.check_over_configs(prediction_type=prediction_type)

def test_full_loop_no_noise(self):
scheduler_class = self.scheduler_classes[0]
scheduler_config = self.get_scheduler_config()
@@ -1663,37 +1567,6 @@ class EulerDiscreteSchedulerTest(SchedulerCommonTest):
assert abs(result_sum.item() - 10.0807) < 1e-2
assert abs(result_mean.item() - 0.0131) < 1e-3

def test_full_loop_with_v_prediction(self):
scheduler_class = self.scheduler_classes[0]
scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
scheduler = scheduler_class(**scheduler_config)

scheduler.set_timesteps(self.num_inference_steps)

if torch_device == "mps":
# device type MPS is not supported for torch.Generator() api.
generator = torch.manual_seed(0)
else:
generator = torch.Generator(device=torch_device).manual_seed(0)

model = self.dummy_model()
sample = self.dummy_sample_deter * scheduler.init_noise_sigma
sample = sample.to(torch_device)

for i, t in enumerate(scheduler.timesteps):
sample = scheduler.scale_model_input(sample, t)

model_output = model(sample, t)

output = scheduler.step(model_output, t, sample, generator=generator)
sample = output.prev_sample

result_sum = torch.sum(torch.abs(sample))
result_mean = torch.mean(torch.abs(sample))

assert abs(result_sum.item() - 0.0002) < 1e-2
assert abs(result_mean.item() - 2.2676e-06) < 1e-3

def test_full_loop_device(self):
scheduler_class = self.scheduler_classes[0]
scheduler_config = self.get_scheduler_config()
@@ -1753,10 +1626,6 @@ class EulerAncestralDiscreteSchedulerTest(SchedulerCommonTest):
for schedule in ["linear", "scaled_linear"]:
self.check_over_configs(beta_schedule=schedule)

def test_prediction_type(self):
for prediction_type in ["epsilon", "v_prediction"]:
self.check_over_configs(prediction_type=prediction_type)

def test_full_loop_no_noise(self):
scheduler_class = self.scheduler_classes[0]
scheduler_config = self.get_scheduler_config()
@@ -1793,42 +1662,6 @@ class EulerAncestralDiscreteSchedulerTest(SchedulerCommonTest):
assert abs(result_sum.item() - 144.8084) < 1e-2
assert abs(result_mean.item() - 0.18855) < 1e-3

def test_full_loop_with_v_prediction(self):
scheduler_class = self.scheduler_classes[0]
scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
scheduler = scheduler_class(**scheduler_config)

scheduler.set_timesteps(self.num_inference_steps)

if torch_device == "mps":
# device type MPS is not supported for torch.Generator() api.
generator = torch.manual_seed(0)
else:
generator = torch.Generator(device=torch_device).manual_seed(0)

model = self.dummy_model()
sample = self.dummy_sample_deter * scheduler.init_noise_sigma
sample = sample.to(torch_device)

for i, t in enumerate(scheduler.timesteps):
sample = scheduler.scale_model_input(sample, t)

model_output = model(sample, t)

output = scheduler.step(model_output, t, sample, generator=generator)
sample = output.prev_sample

result_sum = torch.sum(torch.abs(sample))
result_mean = torch.mean(torch.abs(sample))

if torch_device in ["cpu", "mps"]:
assert abs(result_sum.item() - 108.4439) < 1e-2
assert abs(result_mean.item() - 0.1412) < 1e-3
else:
# CUDA
assert abs(result_sum.item() - 102.5807) < 1e-2
assert abs(result_mean.item() - 0.1335) < 1e-3

def test_full_loop_device(self):
scheduler_class = self.scheduler_classes[0]
scheduler_config = self.get_scheduler_config()
@@ -2101,10 +1934,6 @@ class HeunDiscreteSchedulerTest(SchedulerCommonTest):
|
||||
for schedule in ["linear", "scaled_linear"]:
|
||||
self.check_over_configs(beta_schedule=schedule)
|
||||
|
||||
def test_prediction_type(self):
|
||||
for prediction_type in ["epsilon", "v_prediction"]:
|
||||
self.check_over_configs(prediction_type=prediction_type)
|
||||
|
||||
def test_full_loop_no_noise(self):
|
||||
scheduler_class = self.scheduler_classes[0]
|
||||
scheduler_config = self.get_scheduler_config()
|
||||
@@ -2135,36 +1964,6 @@ class HeunDiscreteSchedulerTest(SchedulerCommonTest):
|
||||
assert abs(result_sum.item() - 0.1233) < 1e-2
|
||||
assert abs(result_mean.item() - 0.0002) < 1e-3
|
||||
|
||||
def test_full_loop_with_v_prediction(self):
|
||||
scheduler_class = self.scheduler_classes[0]
|
||||
scheduler_config = self.get_scheduler_config(prediction_type="v_prediction")
|
||||
scheduler = scheduler_class(**scheduler_config)
|
||||
|
||||
scheduler.set_timesteps(self.num_inference_steps)
|
||||
|
||||
model = self.dummy_model()
|
||||
sample = self.dummy_sample_deter * scheduler.init_noise_sigma
|
||||
sample = sample.to(torch_device)
|
||||
|
||||
for i, t in enumerate(scheduler.timesteps):
|
||||
sample = scheduler.scale_model_input(sample, t)
|
||||
|
||||
model_output = model(sample, t)
|
||||
|
||||
output = scheduler.step(model_output, t, sample)
|
||||
sample = output.prev_sample
|
||||
|
||||
result_sum = torch.sum(torch.abs(sample))
|
||||
result_mean = torch.mean(torch.abs(sample))
|
||||
|
||||
if torch_device in ["cpu", "mps"]:
|
||||
assert abs(result_sum.item() - 4.6934e-07) < 1e-2
|
||||
assert abs(result_mean.item() - 6.1112e-10) < 1e-3
|
||||
else:
|
||||
# CUDA
|
||||
assert abs(result_sum.item() - 4.693428650170972e-07) < 1e-2
|
||||
assert abs(result_mean.item() - 0.0002) < 1e-3
|
||||
|
||||
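Note that the Heun loops above call `scheduler.step` without a `generator`: Heun's method is a deterministic second-order solver, so no sampling noise is drawn inside `step`. A minimal sketch of one update in sigma-space, following Algorithm 1 of Karras et al.; `d` stands for the denoiser-derived derivative and is assumed given:

```python
def heun_step(x, sigma, sigma_next, d):
    # Predictor: plain Euler step using the slope at the current sigma.
    dt = sigma_next - sigma
    d1 = d(x, sigma)
    x_euler = x + d1 * dt
    if sigma_next == 0:
        # The final step has no second evaluation point; fall back to Euler.
        return x_euler
    # Corrector: average the slopes at both ends of the interval.
    d2 = d(x_euler, sigma_next)
    return x + (d1 + d2) / 2 * dt
```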
    def test_full_loop_device(self):
        scheduler_class = self.scheduler_classes[0]
        scheduler_config = self.get_scheduler_config()
@@ -2197,3 +1996,194 @@ class HeunDiscreteSchedulerTest(SchedulerCommonTest):
            # CUDA
            assert abs(result_sum.item() - 0.1233) < 1e-2
            assert abs(result_mean.item() - 0.0002) < 1e-3


class KDPM2DiscreteSchedulerTest(SchedulerCommonTest):
    scheduler_classes = (KDPM2DiscreteScheduler,)
    num_inference_steps = 10

    def get_scheduler_config(self, **kwargs):
        config = {
            "num_train_timesteps": 1100,
            "beta_start": 0.0001,
            "beta_end": 0.02,
            "beta_schedule": "linear",
        }

        config.update(**kwargs)
        return config

    def test_timesteps(self):
        for timesteps in [10, 50, 100, 1000]:
            self.check_over_configs(num_train_timesteps=timesteps)

    def test_betas(self):
        for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)

    def test_schedules(self):
        for schedule in ["linear", "scaled_linear"]:
            self.check_over_configs(beta_schedule=schedule)

    def test_full_loop_no_noise(self):
        if torch_device == "mps":
            return
        scheduler_class = self.scheduler_classes[0]
        scheduler_config = self.get_scheduler_config()
        scheduler = scheduler_class(**scheduler_config)

        scheduler.set_timesteps(self.num_inference_steps)

        model = self.dummy_model()
        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
        sample = sample.to(torch_device)

        for i, t in enumerate(scheduler.timesteps):
            sample = scheduler.scale_model_input(sample, t)

            model_output = model(sample, t)

            output = scheduler.step(model_output, t, sample)
            sample = output.prev_sample

        result_sum = torch.sum(torch.abs(sample))
        result_mean = torch.mean(torch.abs(sample))

        if torch_device in ["cpu", "mps"]:
            assert abs(result_sum.item() - 20.4125) < 1e-2
            assert abs(result_mean.item() - 0.0266) < 1e-3
        else:
            # CUDA
            assert abs(result_sum.item() - 20.4125) < 1e-2
            assert abs(result_mean.item() - 0.0266) < 1e-3

    def test_full_loop_device(self):
        if torch_device == "mps":
            return
        scheduler_class = self.scheduler_classes[0]
        scheduler_config = self.get_scheduler_config()
        scheduler = scheduler_class(**scheduler_config)

        scheduler.set_timesteps(self.num_inference_steps, device=torch_device)

        model = self.dummy_model()
        sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma

        for t in scheduler.timesteps:
            sample = scheduler.scale_model_input(sample, t)

            model_output = model(sample, t)

            output = scheduler.step(model_output, t, sample)
            sample = output.prev_sample

        result_sum = torch.sum(torch.abs(sample))
        result_mean = torch.mean(torch.abs(sample))

        if str(torch_device).startswith("cpu"):
            # The following sum varies between 148 and 156 on mps. Why?
            assert abs(result_sum.item() - 20.4125) < 1e-2
            assert abs(result_mean.item() - 0.0266) < 1e-3
        else:
            # CUDA
            assert abs(result_sum.item() - 20.4125) < 1e-2
            assert abs(result_mean.item() - 0.0266) < 1e-3
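For the new `KDPM2DiscreteScheduler` covered by this class, the usual swap-in pattern applies; a sketch, assuming a Stable Diffusion checkpoint (the model id below is only an example):

```python
from diffusers import KDPM2DiscreteScheduler, StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
# Reuse the existing scheduler config so the beta schedule stays consistent.
pipe.scheduler = KDPM2DiscreteScheduler.from_config(pipe.scheduler.config)
image = pipe("An astronaut riding an elephant").images[0]
```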
class KDPM2AncestralDiscreteSchedulerTest(SchedulerCommonTest):
    scheduler_classes = (KDPM2AncestralDiscreteScheduler,)
    num_inference_steps = 10

    def get_scheduler_config(self, **kwargs):
        config = {
            "num_train_timesteps": 1100,
            "beta_start": 0.0001,
            "beta_end": 0.02,
            "beta_schedule": "linear",
        }

        config.update(**kwargs)
        return config

    def test_timesteps(self):
        for timesteps in [10, 50, 100, 1000]:
            self.check_over_configs(num_train_timesteps=timesteps)

    def test_betas(self):
        for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)

    def test_schedules(self):
        for schedule in ["linear", "scaled_linear"]:
            self.check_over_configs(beta_schedule=schedule)

    def test_full_loop_no_noise(self):
        if torch_device == "mps":
            return
        scheduler_class = self.scheduler_classes[0]
        scheduler_config = self.get_scheduler_config()
        scheduler = scheduler_class(**scheduler_config)

        scheduler.set_timesteps(self.num_inference_steps)

        generator = torch.Generator(device=torch_device).manual_seed(0)

        model = self.dummy_model()
        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
        sample = sample.to(torch_device)

        for i, t in enumerate(scheduler.timesteps):
            sample = scheduler.scale_model_input(sample, t)

            model_output = model(sample, t)

            output = scheduler.step(model_output, t, sample, generator=generator)
            sample = output.prev_sample

        result_sum = torch.sum(torch.abs(sample))
        result_mean = torch.mean(torch.abs(sample))

        if torch_device in ["cpu", "mps"]:
            assert abs(result_sum.item() - 13849.3945) < 1e-2
            assert abs(result_mean.item() - 18.0331) < 5e-3
        else:
            # CUDA
            assert abs(result_sum.item() - 13913.0449) < 1e-2
            assert abs(result_mean.item() - 18.1159) < 5e-3

    def test_full_loop_device(self):
        if torch_device == "mps":
            return
        scheduler_class = self.scheduler_classes[0]
        scheduler_config = self.get_scheduler_config()
        scheduler = scheduler_class(**scheduler_config)

        scheduler.set_timesteps(self.num_inference_steps, device=torch_device)

        if torch_device == "mps":
            # device type MPS is not supported for torch.Generator() api.
            generator = torch.manual_seed(0)
        else:
            generator = torch.Generator(device=torch_device).manual_seed(0)

        model = self.dummy_model()
        sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma

        for t in scheduler.timesteps:
            sample = scheduler.scale_model_input(sample, t)

            model_output = model(sample, t)

            output = scheduler.step(model_output, t, sample, generator=generator)
            sample = output.prev_sample

        result_sum = torch.sum(torch.abs(sample))
        result_mean = torch.mean(torch.abs(sample))

        if str(torch_device).startswith("cpu"):
            assert abs(result_sum.item() - 13849.3945) < 1e-2
            assert abs(result_mean.item() - 18.0331) < 5e-3
        else:
            # CUDA
            assert abs(result_sum.item() - 13913.0459) < 1e-2
            assert abs(result_mean.item() - 18.1159) < 1e-3
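Unlike its deterministic sibling, the ancestral variant draws fresh noise inside every `step` call, which is why these tests thread a seeded `generator` through the loop. A sketch of reproducible sampling under the same assumptions as the previous example:

```python
import torch
from diffusers import KDPM2AncestralDiscreteScheduler, StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.scheduler = KDPM2AncestralDiscreteScheduler.from_config(pipe.scheduler.config)
# Seeding the generator makes the per-step ancestral noise reproducible.
generator = torch.Generator("cpu").manual_seed(0)
image = pipe("An astronaut riding an elephant", generator=generator).images[0]
```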
@@ -626,12 +626,12 @@ class FlaxDDIMSchedulerTest(FlaxSchedulerCommonTest):
            self.check_over_configs(prediction_type=prediction_type)

    def test_deprecated_predict_epsilon(self):
        deprecate("remove this test", "0.11.0", "remove")
        deprecate("remove this test", "0.10.0", "remove")
        for predict_epsilon in [True, False]:
            self.check_over_configs(predict_epsilon=predict_epsilon)

    def test_deprecated_predict_epsilon_to_prediction_type(self):
        deprecate("remove this test", "0.11.0", "remove")
        deprecate("remove this test", "0.10.0", "remove")
        for scheduler_class in self.scheduler_classes:
            scheduler_config = self.get_scheduler_config(predict_epsilon=True)
            scheduler = scheduler_class.from_config(scheduler_config)
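The version bump from "0.11.0" to "0.10.0" in the hunk above changes when `deprecate` from `diffusers.utils` starts failing: it warns while the installed version is below the stated removal version and raises once that version is reached, which keeps stale deprecations from lingering. A sketch of the typical call site; the argument name and message are illustrative:

```python
from diffusers.utils import deprecate

def step(model_output, timestep, sample, predict_epsilon=None, **kwargs):
    if predict_epsilon is not None:
        # Warns with a FutureWarning until diffusers reaches 0.10.0; at or
        # beyond that version, deprecate() raises instead.
        deprecate(
            "predict_epsilon",
            "0.10.0",
            "Please use `prediction_type='epsilon'` instead.",
            standard_warn=False,
        )
```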
@@ -162,9 +162,3 @@ class DeprecateTester(unittest.TestCase):
            deprecate(("deprecated_arg", self.higher_version, "This message is better!!!"), standard_warn=False)

        assert str(warning.warning) == "This message is better!!!"

    def test_deprecate_stacklevel(self):
        with self.assertWarns(FutureWarning) as warning:
            deprecate(("deprecated_arg", self.higher_version, "This message is better!!!"), standard_warn=False)
        assert str(warning.warning) == "This message is better!!!"
        assert "diffusers/tests/test_utils.py" in warning.filename