Compare commits

...

18 Commits

Author SHA1 Message Date
Patrick von Platen
5e80827369 quick fix to make sure sdxl conversion works 2023-07-11 18:54:48 +02:00
Patrick von Platen
6894056e46 Release: v0.18.2 2023-07-11 18:38:27 +02:00
oOraph
c4402daff1 keep _use_default_values as a list type (#4040)
Signed-off-by: Raphael <oOraph@users.noreply.github.com>
Co-authored-by: Raphael <oOraph@users.noreply.github.com>
2023-07-11 18:30:30 +02:00
Patrick von Platen
a2fa787121 Improve single loading file (#4041)
* start improving single file load

* Fix more

* start improving single file load

* Fix sd 2.1

* further improve from_single_file
2023-07-11 18:30:25 +02:00
Lucain
ad7befeb2f FIX force_download in download utility (#4036)
FIX force_download in download utils
2023-07-11 18:30:17 +02:00
Pedro Cuenca
1f392ad45b Remove remaining not in upscale pipeline (#4020)
Remove remaining `not` in upscale pipeline.
2023-07-11 18:30:04 +02:00
Sayak Paul
fe5034c540 minor improvements to the SDXL doc. (#3985)
* minor improvements to the SDXL doc.

* use_refiner variable.

* fix: typo.
2023-07-11 18:29:54 +02:00
Pedro Cuenca
0f5e6454dc Correctly keep vae in float16 when using PyTorch 2 or xFormers (#4019)
* Update pipeline_stable_diffusion_xl.py

fix a bug

* Update pipeline_stable_diffusion_xl_img2img.py

* Update pipeline_stable_diffusion_xl_img2img.py

* Update pipeline_stable_diffusion_upscale.py

* style

---------

Co-authored-by: Hu Ye <xiaohuzc@gmail.com>
2023-07-11 18:29:41 +02:00
Patrick von Platen
638d2bbcd9 [DiffusionPipeline] Deprecate not throwing error when loading non-existent variant (#4011)
* Deprecate variant nicely

* make style

* Apply suggestions from code review

Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>

* Apply suggestions from code review

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

---------

Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
2023-07-11 18:29:32 +02:00
Patrick von Platen
4dfcfaa137 Make sure torch compile doesn't access unet config (#4008) 2023-07-11 18:29:19 +02:00
Patrick von Platen
1c0f6bb2cf Release: v0.18.1 2023-07-07 16:54:13 +02:00
Patrick von Platen
78922ed7c7 Add sdxl prompt embeddings (#3995)
* Add sdxl prompt embeddings

* Fix more

* fix some slow tests
2023-07-07 16:50:53 +02:00
Patrick von Platen
6fde5a6dd6 [Tests] Fix some slow tests (#3989)
fix some slow tests
2023-07-07 15:17:57 +02:00
Amiha
d1d0b8afce Don't use bare prints in a library (#3991) 2023-07-07 15:17:50 +02:00
Batuhan Taskaya
04ddad484e Add 'rank' parameter to Dreambooth LoRA training script (#3945) 2023-07-07 17:26:10 +05:30
Saurav Maheshkar
03d829d59e feat: add Dropout to Flax UNet (#3894)
* feat: add Dropout to Flax UNet

* feat: add @compact decorator

* fix: drop nn.compact
2023-07-07 11:38:16 +02:00
Omar Sanseviero
8d8b4311b9 Fix code snippet for Audio Diffusion (#3987) 2023-07-07 10:39:38 +02:00
Yorai Levi
1fbcc78d6e typo in safetensors (safetenstors) (#3976)
* Update pipeline_utils.py

typo in safetensors (safetenstors)

* Update loaders.py

typo in safetensors (safetenstors)

* Update modeling_utils.py

typo in safetensors (safetenstors)
2023-07-07 13:03:51 +05:30
36 changed files with 508 additions and 173 deletions

View File

@@ -43,7 +43,7 @@ pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-256").to(devic
output = pipe()
display(output.images[0])
display(Audio(output.audios[0], rate=mel.get_sample_rate()))
display(Audio(output.audios[0], rate=pipe.mel.get_sample_rate()))
```
### Latent Audio Diffusion

View File

@@ -21,7 +21,7 @@ The abstract of the paper is the following:
## Tips
- Stable Diffusion XL works especially well with images between 768 and 1024.
- Stable Diffusion XL output image can be improved by making use of a refiner as shown below
- Stable Diffusion XL output image can be improved by making use of a refiner as shown below.
### Available checkpoints:
@@ -40,7 +40,7 @@ pip install safetensors
pip install invisible-watermark>=2.0
```
### *Text-to-Image*
### Text-to-Image
You can use SDXL as follows for *text-to-image*:
@@ -71,6 +71,7 @@ pipe = StableDiffusionXLPipeline.from_pretrained(
)
pipe.to("cuda")
use_refiner = True
refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-refiner-0.9", torch_dtype=torch.float16, use_safetensors=True, variant="fp16"
)
@@ -82,7 +83,29 @@ image = pipe(prompt=prompt, output_type="latent" if use_refiner else "pil").imag
image = refiner(prompt=prompt, image=image[None, :]).images[0]
```
### Loading single file checkpoitns / original file format
### Image-to-image
```py
import torch
from diffusers import StableDiffusionXLImg2ImgPipeline
from diffusers.utils import load_image
pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-refiner-0.9", torch_dtype=torch.float16
)
pipe = pipe.to("cuda")
url = "https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/aa_xl/000000009.png"
init_image = load_image(url).convert("RGB")
prompt = "a photo of an astronaut riding a horse on mars"
image = pipe(prompt, image=init_image).images[0]
```
| Original Image | Refined Image |
|---|---|
| ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/sd_xl/init_image.png) | ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/sd_xl/refined_image.png) |
### Loading single file checkpoints / original file format
By making use of [`~diffusers.loaders.FromSingleFileMixin.from_single_file`] you can also load the
original file format into `diffusers`:
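
The code block for this doc section is not included in the hunk; a minimal sketch of such a load, with the checkpoint URL assumed for illustration:

```py
from diffusers import StableDiffusionXLPipeline

# Hypothetical checkpoint location; any original-format .safetensors or .ckpt
# file works with from_single_file
pipe = StableDiffusionXLPipeline.from_single_file(
    "https://huggingface.co/stabilityai/stable-diffusion-xl-base-0.9/blob/main/sd_xl_base_0.9.safetensors"
)
```
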
@@ -127,7 +150,7 @@ You can speed up inference by making use of `torch.compile`. This should give yo
+ refiner.unet = torch.compile(refiner.unet, mode="reduce-overhead", fullgraph=True)
```
### Running with `torch` < 2.0
### Running with `torch` \< 2.0
**Note** that if you want to run Stable Diffusion XL with `torch` < 2.0, please make sure to enable xformers
attention:
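
The snippet that followed here is cut off in this view; enabling it is a single call on each loaded pipeline (sketch, assuming `xformers` is installed and `pipe`/`refiner` are the pipelines from the snippets above):

```py
pipe.enable_xformers_memory_efficient_attention()
refiner.enable_xformers_memory_efficient_attention()
```
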

View File

@@ -56,7 +56,7 @@ if is_wandb_available():
import wandb
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
check_min_version("0.18.0.dev0")
check_min_version("0.18.0")
logger = get_logger(__name__)

View File

@@ -59,7 +59,7 @@ if is_wandb_available():
import wandb
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
check_min_version("0.18.0.dev0")
check_min_version("0.18.0")
logger = logging.getLogger(__name__)

View File

@@ -57,7 +57,7 @@ from diffusers.utils.import_utils import is_xformers_available
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
check_min_version("0.18.0.dev0")
check_min_version("0.18.0")
logger = get_logger(__name__)

View File

@@ -59,7 +59,7 @@ if is_wandb_available():
import wandb
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
check_min_version("0.18.0.dev0")
check_min_version("0.18.0")
logger = get_logger(__name__)

View File

@@ -36,7 +36,7 @@ from diffusers.utils import check_min_version
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
check_min_version("0.18.0.dev0")
check_min_version("0.18.0")
# Cache compiled models across invocations of this script.
cc.initialize_cache(os.path.expanduser("~/.cache/jax/compilation_cache"))

View File

@@ -65,7 +65,7 @@ from diffusers.utils.import_utils import is_xformers_available
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
check_min_version("0.18.0.dev0")
check_min_version("0.18.0")
logger = get_logger(__name__)
@@ -436,6 +436,12 @@ def parse_args(input_args=None):
default=None,
help="The optional `class_label` conditioning to pass to the unet, available values are `timesteps`.",
)
parser.add_argument(
"--rank",
type=int,
default=4,
help=("The dimension of the LoRA update matrices."),
)
if input_args is not None:
args = parser.parse_args(input_args)
@@ -845,7 +851,9 @@ def main(args):
LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor
)
unet_lora_attn_procs[name] = lora_attn_processor_class(
hidden_size=hidden_size, cross_attention_dim=cross_attention_dim
hidden_size=hidden_size,
cross_attention_dim=cross_attention_dim,
rank=args.rank,
)
unet.set_attn_processor(unet_lora_attn_procs)
@@ -860,7 +868,9 @@ def main(args):
for name, module in text_encoder.named_modules():
if name.endswith(TEXT_ENCODER_ATTN_MODULE):
text_lora_attn_procs[name] = LoRAAttnProcessor(
hidden_size=module.out_proj.out_features, cross_attention_dim=None
hidden_size=module.out_proj.out_features,
cross_attention_dim=None,
rank=args.rank,
)
text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs)
temp_pipeline = DiffusionPipeline.from_pretrained(
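
To illustrate the new flag in isolation, here is a standalone parser with the same argument definition; the training script wires `args.rank` into the LoRA attention processors as shown in the hunks above, and on the command line the flag is passed as e.g. `--rank 8`:

```py
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--rank",
    type=int,
    default=4,
    help="The dimension of the LoRA update matrices.",
)

args = parser.parse_args(["--rank", "8"])
print(args.rank)  # 8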

View File

@@ -52,7 +52,7 @@ from diffusers.utils.import_utils import is_xformers_available
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
check_min_version("0.18.0.dev0")
check_min_version("0.18.0")
logger = get_logger(__name__, log_level="INFO")

View File

@@ -54,7 +54,7 @@ if is_wandb_available():
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
check_min_version("0.18.0.dev0")
check_min_version("0.18.0")
logger = get_logger(__name__, log_level="INFO")

View File

@@ -33,7 +33,7 @@ from diffusers.utils import check_min_version
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
check_min_version("0.18.0.dev0")
check_min_version("0.18.0")
logger = logging.getLogger(__name__)

View File

@@ -48,7 +48,7 @@ from diffusers.utils.import_utils import is_xformers_available
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
check_min_version("0.18.0.dev0")
check_min_version("0.18.0")
logger = get_logger(__name__, log_level="INFO")

View File

@@ -78,7 +78,7 @@ else:
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
check_min_version("0.18.0.dev0")
check_min_version("0.18.0")
logger = get_logger(__name__)

View File

@@ -56,7 +56,7 @@ else:
# ------------------------------------------------------------------------------
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
check_min_version("0.18.0.dev0")
check_min_version("0.18.0")
logger = logging.getLogger(__name__)

View File

@@ -29,7 +29,7 @@ from diffusers.utils.import_utils import is_xformers_available
# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
check_min_version("0.18.0.dev0")
check_min_version("0.18.0")
logger = get_logger(__name__, log_level="INFO")

View File

@@ -232,7 +232,7 @@ install_requires = [
setup(
name="diffusers",
version="0.18.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
version="0.18.2", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
description="Diffusers",
long_description=open("README.md", "r", encoding="utf-8").read(),
long_description_content_type="text/markdown",

View File

@@ -1,4 +1,4 @@
__version__ = "0.18.0.dev0"
__version__ = "0.18.2"
from .configuration_utils import ConfigMixin
from .utils import (

View File

@@ -607,7 +607,7 @@ def register_to_config(init):
# Take note of the parameters that were not present in the loaded config
if len(set(new_kwargs.keys()) - set(init_kwargs)) > 0:
new_kwargs["_use_default_values"] = set(new_kwargs.keys()) - set(init_kwargs)
new_kwargs["_use_default_values"] = list(set(new_kwargs.keys()) - set(init_kwargs))
new_kwargs = {**config_init_kwargs, **new_kwargs}
getattr(self, "register_to_config")(**new_kwargs)
@@ -655,7 +655,7 @@ def flax_register_to_config(cls):
# Take note of the parameters that were not present in the loaded config
if len(set(new_kwargs.keys()) - set(init_kwargs)) > 0:
new_kwargs["_use_default_values"] = set(new_kwargs.keys()) - set(init_kwargs)
new_kwargs["_use_default_values"] = list(set(new_kwargs.keys()) - set(init_kwargs))
getattr(self, "register_to_config")(**new_kwargs)
original_init(self, *args, **kwargs)
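
Presumably the motivation for the list conversion in #4040 is that configs are serialized to JSON, and Python sets are not JSON-serializable; a quick check:

```py
import json

try:
    json.dumps({"_use_default_values": {"scaling_factor"}})
except TypeError as err:
    print(err)  # Object of type set is not JSON serializable

# Lists round-trip fine
print(json.dumps({"_use_default_values": ["scaling_factor"]}))
```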

View File

@@ -177,7 +177,7 @@ class UNet2DConditionLoadersMixin:
if use_safetensors and not is_safetensors_available():
raise ValueError(
"`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
"`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
)
allow_pickle = False
@@ -589,7 +589,7 @@ class TextualInversionLoaderMixin:
if use_safetensors and not is_safetensors_available():
raise ValueError(
"`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
"`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
)
allow_pickle = False
@@ -806,7 +806,7 @@ class LoraLoaderMixin:
if use_safetensors and not is_safetensors_available():
raise ValueError(
"`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
"`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
)
allow_pickle = False
@@ -1054,7 +1054,7 @@ class LoraLoaderMixin:
if use_safetensors and not is_safetensors_available():
raise ValueError(
"`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
"`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
)
allow_pickle = False
@@ -1394,7 +1394,7 @@ class FromSingleFileMixin:
use_auth_token = kwargs.pop("use_auth_token", None)
revision = kwargs.pop("revision", None)
extract_ema = kwargs.pop("extract_ema", False)
image_size = kwargs.pop("image_size", 512)
image_size = kwargs.pop("image_size", None)
scheduler_type = kwargs.pop("scheduler_type", "pndm")
num_in_channels = kwargs.pop("num_in_channels", None)
upcast_attention = kwargs.pop("upcast_attention", None)

View File

@@ -152,6 +152,7 @@ class FlaxAttention(nn.Module):
self.value = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_v")
self.proj_attn = nn.Dense(self.query_dim, dtype=self.dtype, name="to_out_0")
self.dropout_layer = nn.Dropout(rate=self.dropout)
def reshape_heads_to_batch_dim(self, tensor):
batch_size, seq_len, dim = tensor.shape
@@ -214,7 +215,7 @@ class FlaxAttention(nn.Module):
hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
hidden_states = self.proj_attn(hidden_states)
return hidden_states
return self.dropout_layer(hidden_states, deterministic=deterministic)
class FlaxBasicTransformerBlock(nn.Module):
@@ -260,6 +261,7 @@ class FlaxBasicTransformerBlock(nn.Module):
self.norm1 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
self.norm2 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
self.norm3 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
self.dropout_layer = nn.Dropout(rate=self.dropout)
def __call__(self, hidden_states, context, deterministic=True):
# self attention
@@ -280,7 +282,7 @@ class FlaxBasicTransformerBlock(nn.Module):
hidden_states = self.ff(self.norm3(hidden_states), deterministic=deterministic)
hidden_states = hidden_states + residual
return hidden_states
return self.dropout_layer(hidden_states, deterministic=deterministic)
class FlaxTransformer2DModel(nn.Module):
@@ -356,6 +358,8 @@ class FlaxTransformer2DModel(nn.Module):
dtype=self.dtype,
)
self.dropout_layer = nn.Dropout(rate=self.dropout)
def __call__(self, hidden_states, context, deterministic=True):
batch, height, width, channels = hidden_states.shape
residual = hidden_states
@@ -378,7 +382,7 @@ class FlaxTransformer2DModel(nn.Module):
hidden_states = self.proj_out(hidden_states)
hidden_states = hidden_states + residual
return hidden_states
return self.dropout_layer(hidden_states, deterministic=deterministic)
class FlaxFeedForward(nn.Module):
@@ -409,7 +413,7 @@ class FlaxFeedForward(nn.Module):
self.net_2 = nn.Dense(self.dim, dtype=self.dtype)
def __call__(self, hidden_states, deterministic=True):
hidden_states = self.net_0(hidden_states)
hidden_states = self.net_0(hidden_states, deterministic=deterministic)
hidden_states = self.net_2(hidden_states)
return hidden_states
@@ -434,8 +438,9 @@ class FlaxGEGLU(nn.Module):
def setup(self):
inner_dim = self.dim * 4
self.proj = nn.Dense(inner_dim * 2, dtype=self.dtype)
self.dropout_layer = nn.Dropout(rate=self.dropout)
def __call__(self, hidden_states, deterministic=True):
hidden_states = self.proj(hidden_states)
hidden_linear, hidden_gelu = jnp.split(hidden_states, 2, axis=2)
return hidden_linear * nn.gelu(hidden_gelu)
return self.dropout_layer(hidden_linear * nn.gelu(hidden_gelu), deterministic=deterministic)
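
For context on the `deterministic` flag threaded through these layers: in Flax, dropout is disabled when `deterministic=True` (inference) and requires a `"dropout"` RNG when active (training). A minimal sketch with a stand-in module:

```py
import jax
import jax.numpy as jnp
import flax.linen as nn

class Block(nn.Module):
    dropout: float = 0.5

    @nn.compact
    def __call__(self, x, deterministic=True):
        return nn.Dropout(rate=self.dropout)(x, deterministic=deterministic)

x = jnp.ones((1, 4))
block = Block()
params = block.init(jax.random.PRNGKey(0), x)
print(block.apply(params, x, deterministic=True))  # dropout disabled
print(block.apply(params, x, deterministic=False,
                  rngs={"dropout": jax.random.PRNGKey(1)}))  # dropout active
```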

View File

@@ -456,7 +456,7 @@ class ModelMixin(torch.nn.Module):
if use_safetensors and not is_safetensors_available():
raise ValueError(
"`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
"`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
)
allow_pickle = False

View File

@@ -204,7 +204,7 @@ def variant_compatible_siblings(filenames, variant=None) -> Union[List[os.PathLi
transformers_index_format = r"\d{5}-of-\d{5}"
if variant is not None:
# `diffusion_pytorch_model.fp16.bin` as well as `model.fp16-00001-of-00002.safetenstors`
# `diffusion_pytorch_model.fp16.bin` as well as `model.fp16-00001-of-00002.safetensors`
variant_file_re = re.compile(
rf"({'|'.join(weight_prefixes)})\.({variant}|{variant}-{transformers_index_format})\.({'|'.join(weight_suffixs)})$"
)
@@ -213,7 +213,7 @@ def variant_compatible_siblings(filenames, variant=None) -> Union[List[os.PathLi
rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.{variant}\.json$"
)
# `diffusion_pytorch_model.bin` as well as `model-00001-of-00002.safetenstors`
# `diffusion_pytorch_model.bin` as well as `model-00001-of-00002.safetensors`
non_variant_file_re = re.compile(
rf"({'|'.join(weight_prefixes)})(-{transformers_index_format})?\.({'|'.join(weight_suffixs)})$"
)
@@ -1168,7 +1168,7 @@ class DiffusionPipeline(ConfigMixin):
if use_safetensors and not is_safetensors_available():
raise ValueError(
"`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
"`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
)
allow_pickle = False
@@ -1213,6 +1213,15 @@ class DiffusionPipeline(ConfigMixin):
filenames = {sibling.rfilename for sibling in info.siblings}
model_filenames, variant_filenames = variant_compatible_siblings(filenames, variant=variant)
if len(variant_filenames) == 0 and variant is not None:
deprecation_message = (
f"You are trying to load the model files of the `variant={variant}`, but no such modeling files are available."
f"The default model files: {model_filenames} will be loaded instead. Make sure to not load from `variant={variant}`"
"if such variant modeling files are not available. Doing so will lead to an error in v0.22.0 as defaulting to non-variant"
"modeling files is deprecated."
)
deprecate("no variant default", "0.22.0", deprecation_message, standard_warn=False)
# remove ignored filenames
model_filenames = set(model_filenames) - set(ignore_filenames)
variant_filenames = set(variant_filenames) - set(ignore_filenames)
@@ -1302,7 +1311,7 @@ class DiffusionPipeline(ConfigMixin):
snapshot_folder = Path(config_file).parent
pipeline_is_cached = all((snapshot_folder / f).is_file() for f in expected_files)
if pipeline_is_cached:
if pipeline_is_cached and not force_download:
# if the pipeline is cached, we can directly return it
# else call snapshot_download
return snapshot_folder
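
Two behavioral changes land in this file: variant fallback is now deprecated, and `force_download=True` now actually bypasses the local snapshot cache. A sketch of the user-facing patterns, with the model id assumed for illustration:

```py
from diffusers import DiffusionPipeline

# Requesting a variant the repo does not ship currently warns and falls back
# to the default weights; from v0.22.0 it will raise instead.
pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", variant="fp16"
)

# With the fix above, force_download=True re-downloads even when a complete
# cached snapshot exists.
pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", force_download=True
)
```
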

View File

@@ -24,6 +24,7 @@ from transformers import (
AutoFeatureExtractor,
BertTokenizerFast,
CLIPImageProcessor,
CLIPTextConfig,
CLIPTextModel,
CLIPTextModelWithProjection,
CLIPTokenizer,
@@ -48,7 +49,7 @@ from ...schedulers import (
PNDMScheduler,
UnCLIPScheduler,
)
from ...utils import is_omegaconf_available, is_safetensors_available, logging
from ...utils import is_accelerate_available, is_omegaconf_available, is_safetensors_available, logging
from ...utils.import_utils import BACKENDS_MAPPING
from ..latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
from ..paint_by_example import PaintByExampleImageEncoder
@@ -57,6 +58,10 @@ from .safety_checker import StableDiffusionSafetyChecker
from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
if is_accelerate_available():
from accelerate import init_empty_weights
from accelerate.utils import set_module_tensor_to_device
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -391,8 +396,8 @@ def convert_ldm_unet_checkpoint(
# at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
print(f"Checkpoint {path} has both EMA and non-EMA weights.")
print(
logger.warning(f"Checkpoint {path} has both EMA and non-EMA weights.")
logger.warning(
"In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
" weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
)
@@ -402,7 +407,7 @@ def convert_ldm_unet_checkpoint(
unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
else:
if sum(k.startswith("model_ema") for k in keys) > 100:
print(
logger.warning(
"In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
" weights (usually better for inference), please make sure to add the `--extract_ema` flag."
)
@@ -770,11 +775,12 @@ def convert_ldm_bert_checkpoint(checkpoint, config):
def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder=None):
text_model = (
CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
if text_encoder is None
else text_encoder
)
if text_encoder is None:
config_name = "openai/clip-vit-large-patch14"
config = CLIPTextConfig.from_pretrained(config_name)
with init_empty_weights():
text_model = CLIPTextModel(config)
keys = list(checkpoint.keys())
@@ -787,7 +793,8 @@ def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder
if key.startswith(prefix):
text_model_dict[key[len(prefix + ".") :]] = checkpoint[key]
text_model.load_state_dict(text_model_dict)
for param_name, param in text_model_dict.items():
set_module_tensor_to_device(text_model, param_name, "cpu", value=param)
return text_model
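
Several conversions in this file now build the model on the meta device (allocating no memory) and then materialize each tensor from the checkpoint, instead of instantiating the full model and calling `load_state_dict`. The pattern, sketched with a stand-in module (assumes `accelerate` is installed):

```py
import torch
from accelerate import init_empty_weights
from accelerate.utils import set_module_tensor_to_device

with init_empty_weights():
    model = torch.nn.Linear(4, 4)  # stand-in for CLIPTextModel / UNet here

state_dict = {"weight": torch.randn(4, 4), "bias": torch.zeros(4)}
for name, tensor in state_dict.items():
    set_module_tensor_to_device(model, name, "cpu", value=tensor)
```
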
@@ -884,14 +891,26 @@ def convert_paint_by_example_checkpoint(checkpoint):
return model
def convert_open_clip_checkpoint(checkpoint, prefix="cond_stage_model.model."):
def convert_open_clip_checkpoint(
checkpoint, config_name, prefix="cond_stage_model.model.", has_projection=False, **config_kwargs
):
# text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder")
text_model = CLIPTextModelWithProjection.from_pretrained(
"laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", projection_dim=1280
)
# text_model = CLIPTextModelWithProjection.from_pretrained(
# "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", projection_dim=1280
# )
config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs)
with init_empty_weights():
text_model = CLIPTextModelWithProjection(config) if has_projection else CLIPTextModel(config)
keys = list(checkpoint.keys())
keys_to_ignore = []
if config_name == "stabilityai/stable-diffusion-2" and config.num_hidden_layers == 23:
# make sure to remove all keys > 22
keys_to_ignore += [k for k in keys if k.startswith("cond_stage_model.model.transformer.resblocks.23")]
keys_to_ignore += ["cond_stage_model.model.text_projection"]
text_model_dict = {}
if prefix + "text_projection" in checkpoint:
@@ -902,8 +921,8 @@ def convert_open_clip_checkpoint(checkpoint, prefix="cond_stage_model.model."):
text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids")
for key in keys:
# if "resblocks.23" in key: # Diffusers drops the final layer and only uses the penultimate layer
# continue
if key in keys_to_ignore:
continue
if key[len(prefix) :] in textenc_conversion_map:
if key.endswith("text_projection"):
value = checkpoint[key].T
@@ -931,7 +950,8 @@ def convert_open_clip_checkpoint(checkpoint, prefix="cond_stage_model.model."):
text_model_dict[new_key] = checkpoint[key]
text_model.load_state_dict(text_model_dict)
for param_name, param in text_model_dict.items():
set_module_tensor_to_device(text_model, param_name, "cpu", value=param)
return text_model
@@ -1061,7 +1081,7 @@ def convert_controlnet_checkpoint(
def download_from_original_stable_diffusion_ckpt(
checkpoint_path: str,
original_config_file: str = None,
image_size: int = 512,
image_size: Optional[int] = None,
prediction_type: str = None,
model_type: str = None,
extract_ema: bool = False,
@@ -1144,6 +1164,7 @@ def download_from_original_stable_diffusion_ckpt(
LDMTextToImagePipeline,
PaintByExamplePipeline,
StableDiffusionControlNetPipeline,
StableDiffusionInpaintPipeline,
StableDiffusionPipeline,
StableDiffusionXLImg2ImgPipeline,
StableDiffusionXLPipeline,
@@ -1166,12 +1187,9 @@ def download_from_original_stable_diffusion_ckpt(
if not is_safetensors_available():
raise ValueError(BACKENDS_MAPPING["safetensors"][1])
from safetensors import safe_open
from safetensors.torch import load_file as safe_load
checkpoint = {}
with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
for key in f.keys():
checkpoint[key] = f.get_tensor(key)
checkpoint = safe_load(checkpoint_path, device="cpu")
else:
if device is None:
device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -1183,7 +1201,7 @@ def download_from_original_stable_diffusion_ckpt(
if "global_step" in checkpoint:
global_step = checkpoint["global_step"]
else:
print("global_step key not found in model")
logger.debug("global_step key not found in model")
global_step = None
# NOTE: this while loop isn't great but this controlnet checkpoint has one additional
@@ -1230,8 +1248,15 @@ def download_from_original_stable_diffusion_ckpt(
model_type = "SDXL"
else:
model_type = "SDXL-Refiner"
if image_size is None:
image_size = 1024
if num_in_channels is not None:
if num_in_channels is None and pipeline_class == StableDiffusionInpaintPipeline:
num_in_channels = 9
elif num_in_channels is None:
num_in_channels = 4
if "unet_config" in original_config.model.params:
original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
if (
@@ -1263,7 +1288,6 @@ def download_from_original_stable_diffusion_ckpt(
num_train_timesteps = getattr(original_config.model.params, "timesteps", None) or 1000
if model_type in ["SDXL", "SDXL-Refiner"]:
image_size = 1024
scheduler_dict = {
"beta_schedule": "scaled_linear",
"beta_start": 0.00085,
@@ -1279,7 +1303,6 @@ def download_from_original_stable_diffusion_ckpt(
}
scheduler = EulerDiscreteScheduler.from_config(scheduler_dict)
scheduler_type = "euler"
vae_path = "stabilityai/sdxl-vae"
else:
beta_start = getattr(original_config.model.params, "linear_start", None) or 0.02
beta_end = getattr(original_config.model.params, "linear_end", None) or 0.085
@@ -1318,25 +1341,45 @@ def download_from_original_stable_diffusion_ckpt(
# Convert the UNet2DConditionModel model.
unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
unet_config["upcast_attention"] = upcast_attention
unet = UNet2DConditionModel(**unet_config)
with init_empty_weights():
unet = UNet2DConditionModel(**unet_config)
converted_unet_checkpoint = convert_ldm_unet_checkpoint(
checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema
)
unet.load_state_dict(converted_unet_checkpoint)
for param_name, param in converted_unet_checkpoint.items():
set_module_tensor_to_device(unet, param_name, "cpu", value=param)
# Convert the VAE model.
if vae_path is None:
vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
vae = AutoencoderKL(**vae_config)
vae.load_state_dict(converted_vae_checkpoint)
if (
"model" in original_config
and "params" in original_config.model
and "scale_factor" in original_config.model.params
):
vae_scaling_factor = original_config.model.params.scale_factor
else:
vae_scaling_factor = 0.18215 # default SD scaling factor
vae_config["scaling_factor"] = vae_scaling_factor
with init_empty_weights():
vae = AutoencoderKL(**vae_config)
for param_name, param in converted_vae_checkpoint.items():
set_module_tensor_to_device(vae, param_name, "cpu", value=param)
else:
vae = AutoencoderKL.from_pretrained(vae_path)
if model_type == "FrozenOpenCLIPEmbedder":
text_model = convert_open_clip_checkpoint(checkpoint)
config_name = "stabilityai/stable-diffusion-2"
config_kwargs = {"subfolder": "text_encoder"}
text_model = convert_open_clip_checkpoint(checkpoint, config_name, **config_kwargs)
tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer")
if stable_unclip is None:
@@ -1469,7 +1512,12 @@ def download_from_original_stable_diffusion_ckpt(
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!")
text_encoder_2 = convert_open_clip_checkpoint(checkpoint, prefix="conditioner.embedders.1.model.")
config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
config_kwargs = {"projection_dim": 1280}
text_encoder_2 = convert_open_clip_checkpoint(
checkpoint, config_name, prefix="conditioner.embedders.1.model.", has_projection=True, **config_kwargs
)
pipe = StableDiffusionXLPipeline(
vae=vae,
@@ -1485,7 +1533,12 @@ def download_from_original_stable_diffusion_ckpt(
tokenizer = None
text_encoder = None
tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!")
text_encoder_2 = convert_open_clip_checkpoint(checkpoint, prefix="conditioner.embedders.0.model.")
config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
config_kwargs = {"projection_dim": 1280}
text_encoder_2 = convert_open_clip_checkpoint(
checkpoint, config_name, prefix="conditioner.embedders.0.model.", has_projection=True, **config_kwargs
)
pipe = StableDiffusionXLImg2ImgPipeline(
vae=vae,

View File

@@ -24,7 +24,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...configuration_utils import FrozenDict
from ...image_processor import VaeImageProcessor
from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor
@@ -153,7 +153,9 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool
return mask, masked_image
class StableDiffusionInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
class StableDiffusionInpaintPipeline(
DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
):
r"""
Pipeline for text-guided image inpainting using Stable Diffusion.
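
With the mixin added, the inpainting pipeline gains single-file loading; usage as exercised by the new tests further down:

```py
import torch
from diffusers import StableDiffusionInpaintPipeline

pipe = StableDiffusionInpaintPipeline.from_single_file(
    "https://huggingface.co/runwayml/stable-diffusion-inpainting/blob/main/sd-v1-5-inpainting.ckpt",
    torch_dtype=torch.float16,
)
```
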

View File

@@ -748,15 +748,19 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline, TextualInversionLoaderMi
# make sure the VAE is in float32 mode, as it overflows in float16
self.vae.to(dtype=torch.float32)
use_torch_2_0_or_xformers = self.vae.decoder.mid_block.attentions[0].processor in [
AttnProcessor2_0,
XFormersAttnProcessor,
LoRAXFormersAttnProcessor,
LoRAAttnProcessor2_0,
]
use_torch_2_0_or_xformers = isinstance(
self.vae.decoder.mid_block.attentions[0].processor,
(
AttnProcessor2_0,
XFormersAttnProcessor,
LoRAXFormersAttnProcessor,
LoRAAttnProcessor2_0,
),
)
# if xformers or torch_2_0 is used attention block does not need
# to be in float32 which can save lots of memory
if not use_torch_2_0_or_xformers:
if use_torch_2_0_or_xformers:
self.vae.post_quant_conv.to(latents.dtype)
self.vae.decoder.conv_in.to(latents.dtype)
self.vae.decoder.mid_block.to(latents.dtype)
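
The old membership test compared a processor *instance* against a list of *classes*, which is always False (which is also why the `not` in the branch had to be flipped). A minimal illustration with a stand-in class:

```py
class AttnProcessor2_0:  # stand-in for the real processor class
    pass

proc = AttnProcessor2_0()
print(proc in [AttnProcessor2_0])             # False: instance != class
print(isinstance(proc, (AttnProcessor2_0,)))  # True: the intended check
```
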

View File

@@ -8,7 +8,6 @@ from ...utils import BaseOutput, is_invisible_watermark_available, is_torch_avai
@dataclass
# Copied from diffusers.pipelines.stable_diffusion.__init__.StableDiffusionPipelineOutput with StableDiffusion->StableDiffusionXL
class StableDiffusionXLPipelineOutput(BaseOutput):
"""
Output class for Stable Diffusion pipelines.
@@ -17,13 +16,9 @@ class StableDiffusionXLPipelineOutput(BaseOutput):
images (`List[PIL.Image.Image]` or `np.ndarray`)
List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
nsfw_content_detected (`List[bool]`)
List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
(nsfw) content, or `None` if safety checking could not be performed.
"""
images: Union[List[PIL.Image.Image], np.ndarray]
nsfw_content_detected: Optional[List[bool]]
if is_transformers_available() and is_torch_available() and is_invisible_watermark_available():

View File

@@ -129,9 +129,11 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.default_sample_size = self.unet.config.sample_size
self.watermark = StableDiffusionXLWatermarker()
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
def enable_vae_slicing(self):
r"""
Enable sliced VAE decoding.
@@ -141,6 +143,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
"""
self.vae.enable_slicing()
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
def disable_vae_slicing(self):
r"""
Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
@@ -148,6 +151,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
"""
self.vae.disable_slicing()
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
def enable_vae_tiling(self):
r"""
Enable tiled VAE decoding.
@@ -157,6 +161,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
"""
self.vae.enable_tiling()
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
def disable_vae_tiling(self):
r"""
Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to
@@ -183,7 +188,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
for cpu_offloaded_model in [self.unet, self.text_encoder, self.text_encoder_2, self.vae]:
cpu_offload(cpu_offloaded_model, device)
def enable_model_cpu_offload(self, gpu_id=0):
@@ -217,6 +222,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
self.final_offload_hook = hook
@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
r"""
Returns the device on which the pipeline's models will be executed. After calling
@@ -237,12 +243,14 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
def encode_prompt(
self,
prompt,
device,
num_images_per_prompt,
do_classifier_free_guidance,
device: Optional[torch.device] = None,
num_images_per_prompt: int = 1,
do_classifier_free_guidance: bool = True,
negative_prompt=None,
prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None,
):
r"""
@@ -268,9 +276,18 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument.
pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
If not provided, pooled text embeddings will be generated from `prompt` input argument.
negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
input argument.
lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
"""
device = device or self._execution_device
# set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it
if lora_scale is not None and isinstance(self, LoraLoaderMixin):
@@ -399,6 +416,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
bs_embed = pooled_prompt_embeds.shape[0]
pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
bs_embed * num_images_per_prompt, -1
)
@@ -408,20 +426,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
def run_safety_checker(self, image, device, dtype):
if self.safety_checker is None:
has_nsfw_concept = None
else:
if torch.is_tensor(image):
feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
else:
feature_extractor_input = self.image_processor.numpy_to_pil(image)
safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
return image, has_nsfw_concept
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
def prepare_extra_step_kwargs(self, generator, eta):
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
@@ -448,6 +453,8 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
negative_prompt=None,
prompt_embeds=None,
negative_prompt_embeds=None,
pooled_prompt_embeds=None,
negative_pooled_prompt_embeds=None,
):
if height % 8 != 0 or width % 8 != 0:
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
@@ -486,6 +493,17 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
f" {negative_prompt_embeds.shape}."
)
if prompt_embeds is not None and pooled_prompt_embeds is None:
raise ValueError(
"If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
)
if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
raise ValueError(
"If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
)
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
if isinstance(generator, list) and len(generator) != batch_size:
@@ -535,6 +553,8 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
latents: Optional[torch.FloatTensor] = None,
prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -588,6 +608,13 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument.
pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
If not provided, pooled text embeddings will be generated from `prompt` input argument.
negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
input argument.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -626,15 +653,23 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
"not-safe-for-work" (nsfw) content, according to the `safety_checker`.
"""
# 0. Default height and width to unet
height = height or self.unet.config.sample_size * self.vae_scale_factor
width = width or self.unet.config.sample_size * self.vae_scale_factor
height = height or self.default_sample_size * self.vae_scale_factor
width = width or self.default_sample_size * self.vae_scale_factor
original_size = original_size or (height, width)
target_size = target_size or (height, width)
# 1. Check inputs. Raise error if not correct
self.check_inputs(
prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
prompt,
height,
width,
callback_steps,
negative_prompt,
prompt_embeds,
negative_prompt_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds,
)
# 2. Define call parameters
@@ -669,6 +704,8 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
negative_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
lora_scale=text_encoder_lora_scale,
)
@@ -749,15 +786,18 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
# make sure the VAE is in float32 mode, as it overflows in float16
self.vae.to(dtype=torch.float32)
use_torch_2_0_or_xformers = self.vae.decoder.mid_block.attentions[0].processor in [
AttnProcessor2_0,
XFormersAttnProcessor,
LoRAXFormersAttnProcessor,
LoRAAttnProcessor2_0,
]
use_torch_2_0_or_xformers = isinstance(
self.vae.decoder.mid_block.attentions[0].processor,
(
AttnProcessor2_0,
XFormersAttnProcessor,
LoRAXFormersAttnProcessor,
LoRAAttnProcessor2_0,
),
)
# if xformers or torch_2_0 is used attention block does not need
# to be in float32 which can save lots of memory
if not use_torch_2_0_or_xformers:
if use_torch_2_0_or_xformers:
self.vae.post_quant_conv.to(latents.dtype)
self.vae.decoder.conv_in.to(latents.dtype)
self.vae.decoder.mid_block.to(latents.dtype)
@@ -765,27 +805,19 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
latents = latents.float()
if not output_type == "latent":
# CHECK there is problem here (PVP)
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
has_nsfw_concept = None
else:
image = latents
has_nsfw_concept = None
return StableDiffusionXLPipelineOutput(images=image, nsfw_content_detected=None)
if has_nsfw_concept is None:
do_denormalize = [True] * image.shape[0]
else:
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
return StableDiffusionXLPipelineOutput(images=image)
image = self.watermark.apply_watermark(image)
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
image = self.image_processor.postprocess(image, output_type=output_type)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
if not return_dict:
return (image, has_nsfw_concept)
return (image,)
return StableDiffusionXLPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
return StableDiffusionXLPipelineOutput(images=image)
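
Putting the new arguments together (sketch; `pipe` assumed to be a loaded `StableDiffusionXLPipeline`): `encode_prompt` now also returns the pooled embeddings, and both must be passed back in whenever pre-computed embeddings are used, as enforced by the new `check_inputs` errors above:

```py
(
    prompt_embeds,
    negative_prompt_embeds,
    pooled_prompt_embeds,
    negative_pooled_prompt_embeds,
) = pipe.encode_prompt("an astronaut riding a horse on mars")

image = pipe(
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_prompt_embeds,
    pooled_prompt_embeds=pooled_prompt_embeds,
    negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
).images[0]
```
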

View File

@@ -140,6 +140,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
self.watermark = StableDiffusionXLWatermarker()
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
def enable_vae_slicing(self):
r"""
Enable sliced VAE decoding.
@@ -149,6 +150,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
"""
self.vae.enable_slicing()
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
def disable_vae_slicing(self):
r"""
Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
@@ -156,6 +158,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
"""
self.vae.disable_slicing()
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
def enable_vae_tiling(self):
r"""
Enable tiled VAE decoding.
@@ -165,6 +168,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
"""
self.vae.enable_tiling()
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
def disable_vae_tiling(self):
r"""
Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to
@@ -172,6 +176,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
"""
self.vae.disable_tiling()
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.enable_sequential_cpu_offload
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
@@ -191,9 +196,10 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
for cpu_offloaded_model in [self.unet, self.text_encoder, self.text_encoder_2, self.vae]:
cpu_offload(cpu_offloaded_model, device)
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -225,6 +231,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
self.final_offload_hook = hook
@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
r"""
Returns the device on which the pipeline's models will be executed. After calling
@@ -242,15 +249,18 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
return torch.device(module._hf_hook.execution_device)
return self.device
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
def encode_prompt(
self,
prompt,
device,
num_images_per_prompt,
do_classifier_free_guidance,
device: Optional[torch.device] = None,
num_images_per_prompt: int = 1,
do_classifier_free_guidance: bool = True,
negative_prompt=None,
prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None,
):
r"""
@@ -276,9 +286,18 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument.
pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
If not provided, pooled text embeddings will be generated from `prompt` input argument.
negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
input argument.
lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
"""
device = device or self._execution_device
# set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it
if lora_scale is not None and isinstance(self, LoraLoaderMixin):
@@ -327,13 +346,11 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
text_input_ids.to(device),
output_hidden_states=True,
)
# We are only ALWAYS interested in the pooled output of the final text encoder
pooled_prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.hidden_states[-2]
prompt_embeds = prompt_embeds
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
@@ -349,10 +366,9 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
negative_prompt_embeds = torch.zeros_like(prompt_embeds)
negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
elif do_classifier_free_guidance and negative_prompt_embeds is None:
negative_prompt = negative_prompt or ""
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
elif prompt is not None and type(prompt) is not type(negative_prompt):
if prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
@@ -389,7 +405,6 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
)
# We are only ALWAYS interested in the pooled output of the final text encoder
negative_pooled_prompt_embeds = negative_prompt_embeds[0]
negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
if do_classifier_free_guidance:
@@ -411,6 +426,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
bs_embed = pooled_prompt_embeds.shape[0]
pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
bs_embed * num_images_per_prompt, -1
)
@@ -420,20 +436,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
def run_safety_checker(self, image, device, dtype):
if self.safety_checker is None:
has_nsfw_concept = None
else:
if torch.is_tensor(image):
feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
else:
feature_extractor_input = self.image_processor.numpy_to_pil(image)
safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
return image, has_nsfw_concept
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
def prepare_extra_step_kwargs(self, generator, eta):
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
@@ -624,6 +627,8 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
latents: Optional[torch.FloatTensor] = None,
prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -683,6 +688,13 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument.
pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
If not provided, pooled text embeddings will be generated from `prompt` input argument.
negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
input argument.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -759,6 +771,8 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
negative_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
lora_scale=text_encoder_lora_scale,
)
@@ -845,15 +859,18 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
# make sure the VAE is in float32 mode, as it overflows in float16
self.vae.to(dtype=torch.float32)
use_torch_2_0_or_xformers = self.vae.decoder.mid_block.attentions[0].processor in [
AttnProcessor2_0,
XFormersAttnProcessor,
LoRAXFormersAttnProcessor,
LoRAAttnProcessor2_0,
]
use_torch_2_0_or_xformers = isinstance(
self.vae.decoder.mid_block.attentions[0].processor,
(
AttnProcessor2_0,
XFormersAttnProcessor,
LoRAXFormersAttnProcessor,
LoRAAttnProcessor2_0,
),
)
# if xformers or torch_2_0 is used attention block does not need
# to be in float32 which can save lots of memory
if not use_torch_2_0_or_xformers:
if use_torch_2_0_or_xformers:
self.vae.post_quant_conv.to(latents.dtype)
self.vae.decoder.conv_in.to(latents.dtype)
self.vae.decoder.mid_block.to(latents.dtype)
@@ -862,24 +879,18 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
-            has_nsfw_concept = None
         else:
             image = latents
-            return StableDiffusionXLPipelineOutput(images=image, nsfw_content_detected=None)
-        if has_nsfw_concept is None:
-            do_denormalize = [True] * image.shape[0]
-        else:
-            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+            return StableDiffusionXLPipelineOutput(images=image)
         image = self.watermark.apply_watermark(image)
-        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
+        image = self.image_processor.postprocess(image, output_type=output_type)
         # Offload last model to CPU
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.final_offload_hook.offload()
         if not return_dict:
-            return (image, has_nsfw_concept)
+            return (image,)
-        return StableDiffusionXLPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+        return StableDiffusionXLPipelineOutput(images=image)
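
With this change, requesting `output_type="latent"` returns early with the raw latents and no `nsfw_content_detected` field; decoding can then happen separately, for example before handing off to a refiner. A sketch, with the checkpoint id assumed:

```python
import torch

from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-0.9", torch_dtype=torch.float16
).to("cuda")

latents = pipe("a photo of a cat", output_type="latent").images

# Decode manually later, upcasting the VAE to float32 as the pipeline itself does.
pipe.vae.to(torch.float32)
with torch.no_grad():
    image = pipe.vae.decode(
        latents.to(torch.float32) / pipe.vae.config.scaling_factor, return_dict=False
    )[0]
```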

View File

@@ -264,7 +264,7 @@ class ConfigTester(unittest.TestCase):
config_dict = {k: v for k, v in config.config.items() if not k.startswith("_")}
# make sure that default config has all keys in `_use_default_values`
-        assert set(config_dict.keys()) == config.config._use_default_values
+        assert set(config_dict.keys()) == set(config.config._use_default_values)
with tempfile.TemporaryDirectory() as tmpdirname:
config.save_config(tmpdirname)
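
The old assertion could never hold once `_use_default_values` was kept as a list (#4040): in Python a `set` never compares equal to a `list`, whatever their contents, so both sides must be normalized to the same type:

```python
# A set never equals a list, even with identical members.
assert {"a", "b"} != ["a", "b"]
assert {"a", "b"} == set(["a", "b"])
```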

View File

@@ -20,17 +20,20 @@ import unittest
import numpy as np
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import (
AutoencoderKL,
DDIMScheduler,
DPMSolverMultistepScheduler,
LMSDiscreteScheduler,
PNDMScheduler,
StableDiffusionInpaintPipeline,
UNet2DConditionModel,
)
from diffusers.models.attention_processor import AttnProcessor
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import prepare_mask_and_masked_image
from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device
from diffusers.utils.testing_utils import (
@@ -512,6 +515,42 @@ class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase):
assert np.abs(expected_slice - image_slice).max() < 6e-4
def test_download_local(self):
filename = hf_hub_download("runwayml/stable-diffusion-inpainting", filename="sd-v1-5-inpainting.ckpt")
pipe = StableDiffusionInpaintPipeline.from_single_file(filename, torch_dtype=torch.float16)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.to("cuda")
inputs = self.get_inputs(torch_device)
inputs["num_inference_steps"] = 1
image_out = pipe(**inputs).images[0]
assert image_out.shape == (512, 512, 3)
def test_download_ckpt_diff_format_is_same(self):
ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-inpainting/blob/main/sd-v1-5-inpainting.ckpt"
pipe = StableDiffusionInpaintPipeline.from_single_file(ckpt_path)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.unet.set_attn_processor(AttnProcessor())
pipe.to("cuda")
inputs = self.get_inputs(torch_device)
inputs["num_inference_steps"] = 5
image_ckpt = pipe(**inputs).images[0]
pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting")
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.unet.set_attn_processor(AttnProcessor())
pipe.to("cuda")
inputs = self.get_inputs(torch_device)
inputs["num_inference_steps"] = 5
image = pipe(**inputs).images[0]
assert np.max(np.abs(image - image_ckpt)) < 1e-4
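
Both tests rebuild the scheduler with `DDIMScheduler.from_config(pipe.scheduler.config)`, a pattern worth noting on its own: any scheduler can be instantiated from another scheduler's config. A minimal sketch mirroring the test:

```python
from diffusers import DDIMScheduler, DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-inpainting")
# Swap in DDIM while keeping the pipeline's scheduler configuration.
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
```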
@nightly
@require_torch_gpu

View File

@@ -19,6 +19,7 @@ import unittest
import numpy as np
import torch
from huggingface_hub import hf_hub_download
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import (
@@ -29,6 +30,7 @@ from diffusers import (
StableDiffusionPipeline,
UNet2DConditionModel,
)
from diffusers.models.attention_processor import AttnProcessor
from diffusers.utils import load_numpy, slow, torch_device
from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
@@ -426,6 +428,40 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
assert image.shape == (768, 768, 3)
assert np.abs(expected_image - image).max() < 7.5e-1
def test_download_local(self):
filename = hf_hub_download("stabilityai/stable-diffusion-2-1", filename="v2-1_768-ema-pruned.safetensors")
pipe = StableDiffusionPipeline.from_single_file(filename, torch_dtype=torch.float16)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.to("cuda")
image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]
assert image_out.shape == (768, 768, 3)
def test_download_ckpt_diff_format_is_same(self):
single_file_path = (
"https://huggingface.co/stabilityai/stable-diffusion-2-1/blob/main/v2-1_768-ema-pruned.safetensors"
)
pipe_single = StableDiffusionPipeline.from_single_file(single_file_path)
pipe_single.scheduler = DDIMScheduler.from_config(pipe_single.scheduler.config)
pipe_single.unet.set_attn_processor(AttnProcessor())
pipe_single.to("cuda")
generator = torch.Generator(device="cpu").manual_seed(0)
image_ckpt = pipe_single("a turtle", num_inference_steps=5, generator=generator, output_type="np").images[0]
pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1")
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.unet.set_attn_processor(AttnProcessor())
pipe.to("cuda")
generator = torch.Generator(device="cpu").manual_seed(0)
image = pipe("a turtle", num_inference_steps=5, generator=generator, output_type="np").images[0]
assert np.max(np.abs(image - image_ckpt)) < 1e-3
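
Both new tests exercise the reworked single-file loader (#4041). A minimal usage sketch; the local path and output directory are placeholders, not values from the diff:

```python
import torch

from diffusers import StableDiffusionPipeline

# Load a single-file checkpoint (a local path or, as in the tests above, a Hub URL)
# and re-save it in the multi-folder diffusers layout.
pipe = StableDiffusionPipeline.from_single_file(
    "./v2-1_768-ema-pruned.safetensors", torch_dtype=torch.float16
)
pipe.save_pretrained("./stable-diffusion-2-1-diffusers", safe_serialization=True)
```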
def test_stable_diffusion_text2img_intermediate_state_v_pred(self):
number_of_steps = 0

View File

@@ -144,6 +144,46 @@ class StableDiffusionXLPipelineFastTests(PipelineLatentTesterMixin, PipelineTest
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_stable_diffusion_xl_negative_prompt_embeds(self):
components = self.get_dummy_components()
sd_pipe = StableDiffusionXLPipeline(**components)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe.set_progress_bar_config(disable=None)
# forward without prompt embeds
inputs = self.get_dummy_inputs(torch_device)
negative_prompt = 3 * ["this is a negative prompt"]
inputs["negative_prompt"] = negative_prompt
inputs["prompt"] = 3 * [inputs["prompt"]]
output = sd_pipe(**inputs)
image_slice_1 = output.images[0, -3:, -3:, -1]
# forward with prompt embeds
inputs = self.get_dummy_inputs(torch_device)
negative_prompt = 3 * ["this is a negative prompt"]
prompt = 3 * [inputs.pop("prompt")]
(
prompt_embeds,
negative_prompt_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds,
) = sd_pipe.encode_prompt(prompt, negative_prompt=negative_prompt)
output = sd_pipe(
**inputs,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
)
image_slice_2 = output.images[0, -3:, -3:, -1]
# make sure that it's equal
assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
def test_attention_slicing_forward_pass(self):
super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)

View File

@@ -165,6 +165,46 @@ class StableDiffusionXLImg2ImgPipelineFastTests(PipelineLatentTesterMixin, Pipel
def test_save_load_optional_components(self):
pass
def test_stable_diffusion_xl_img2img_negative_prompt_embeds(self):
components = self.get_dummy_components()
sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe.set_progress_bar_config(disable=None)
# forward without prompt embeds
inputs = self.get_dummy_inputs(torch_device)
negative_prompt = 3 * ["this is a negative prompt"]
inputs["negative_prompt"] = negative_prompt
inputs["prompt"] = 3 * [inputs["prompt"]]
output = sd_pipe(**inputs)
image_slice_1 = output.images[0, -3:, -3:, -1]
# forward with prompt embeds
inputs = self.get_dummy_inputs(torch_device)
negative_prompt = 3 * ["this is a negative prompt"]
prompt = 3 * [inputs.pop("prompt")]
(
prompt_embeds,
negative_prompt_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds,
) = sd_pipe.encode_prompt(prompt, negative_prompt=negative_prompt)
output = sd_pipe(
**inputs,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
)
image_slice_2 = output.images[0, -3:, -3:, -1]
# make sure that it's equal
assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
@slow
@require_torch_gpu

View File

@@ -14,6 +14,7 @@
# limitations under the License.
import gc
import glob
import json
import os
import random
@@ -56,6 +57,7 @@ from diffusers import (
UniPCMultistepScheduler,
logging,
)
from diffusers.pipelines.pipeline_utils import variant_compatible_siblings
from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
from diffusers.utils import (
CONFIG_NAME,
@@ -1361,6 +1363,29 @@ class PipelineFastTests(unittest.TestCase):
assert sd.config.safety_checker != (None, None)
assert sd.config.feature_extractor != (None, None)
def test_warning_no_variant_available(self):
variant = "fp16"
with self.assertWarns(FutureWarning) as warning_context:
cached_folder = StableDiffusionPipeline.download(
"hf-internal-testing/diffusers-stable-diffusion-tiny-all", variant=variant
)
assert "but no such modeling files are available" in str(warning_context.warning)
assert variant in str(warning_context.warning)
def get_all_filenames(directory):
filenames = glob.glob(directory + "/**", recursive=True)
filenames = [f for f in filenames if os.path.isfile(f)]
return filenames
filenames = get_all_filenames(str(cached_folder))
all_model_files, variant_model_files = variant_compatible_siblings(filenames, variant=variant)
# make sure that none of the model names are variant model names
assert len(variant_model_files) == 0
assert len(all_model_files) > 0
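
The new test pins down the deprecation path from #4011: requesting a variant the repository does not ship currently falls back to the regular weights with a `FutureWarning`, and is slated to become a hard error later. A sketch that mirrors the test's own call:

```python
import warnings

from diffusers import StableDiffusionPipeline

# Downloading with a variant the repo does not ship warns (FutureWarning)
# and falls back to the non-variant files.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    cached_folder = StableDiffusionPipeline.download(
        "hf-internal-testing/diffusers-stable-diffusion-tiny-all", variant="fp16"
    )
print(cached_folder)
print([str(w.message) for w in caught if issubclass(w.category, FutureWarning)])
```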
@slow
@require_torch_gpu

View File

@@ -699,12 +699,16 @@ class PipelineTesterMixin:
inputs = self.get_dummy_inputs(torch_device)
output_without_offload = pipe(**inputs)[0]
-        output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload
+        output_without_offload = (
+            output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload
+        )
pipe.enable_xformers_memory_efficient_attention()
inputs = self.get_dummy_inputs(torch_device)
output_with_offload = pipe(**inputs)[0]
-        output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_without_offload
+        output_with_offload = (
+            output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_without_offload
+        )
if test_max_difference:
max_diff = np.abs(output_with_offload - output_without_offload).max()

View File

@@ -26,7 +26,7 @@ from diffusers import (
TextToVideoSDPipeline,
UNet3DConditionModel,
)
-from diffusers.utils import load_numpy, skip_mps, slow
+from diffusers.utils import is_xformers_available, load_numpy, skip_mps, slow, torch_device
from diffusers.utils.testing_utils import enable_full_determinism
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
@@ -143,6 +143,13 @@ class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
def test_attention_slicing_forward_pass(self):
self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False, expected_max_diff=3e-3)
@unittest.skipIf(
torch_device != "cuda" or not is_xformers_available(),
reason="XFormers attention is only available with CUDA and `xformers` installed",
)
def test_xformers_attention_forwardGenerator_pass(self):
self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False, expected_max_diff=1e-2)
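
The same `is_xformers_available()` guard used in the test decorator also works at inference time. A sketch, assuming the public text-to-video checkpoint:

```python
import torch

from diffusers import TextToVideoSDPipeline
from diffusers.utils import is_xformers_available

pipe = TextToVideoSDPipeline.from_pretrained(
    "damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16
).to("cuda")

# Only opt into memory-efficient attention when xformers is actually installed.
if is_xformers_available():
    pipe.enable_xformers_memory_efficient_attention()

video_frames = pipe("a panda playing guitar", num_inference_steps=25).frames
```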
# (todo): sayakpaul
@unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
def test_inference_batch_consistent(self):