Compare commits

...

11 Commits

Author SHA1 Message Date
Patrick von Platen
5e80827369 quick fix to make sure sdxl conversion works 2023-07-11 18:54:48 +02:00
Patrick von Platen
6894056e46 Release: v0.18.2 2023-07-11 18:38:27 +02:00
oOraph
c4402daff1 keep _use_default_values as a list type (#4040)
Signed-off-by: Raphael <oOraph@users.noreply.github.com>
Co-authored-by: Raphael <oOraph@users.noreply.github.com>
2023-07-11 18:30:30 +02:00
Patrick von Platen
a2fa787121 Improve single loading file (#4041)
* start improving single file load

* Fix more

* start improving single file load

* Fix sd 2.1

* further improve from_single_file
2023-07-11 18:30:25 +02:00
Lucain
ad7befeb2f FIX force_download in download utility (#4036)
FIX force_download in download utils
2023-07-11 18:30:17 +02:00
Pedro Cuenca
1f392ad45b Remove remaining not in upscale pipeline (#4020)
Remove remaining `not` in upscale pipeline.
2023-07-11 18:30:04 +02:00
Sayak Paul
fe5034c540 minor improvements to the SDXL doc. (#3985)
* minor improvements to the SDXL doc.

* use_refiner variable.

* fix: typo.
2023-07-11 18:29:54 +02:00
Pedro Cuenca
0f5e6454dc Correctly keep vae in float16 when using PyTorch 2 or xFormers (#4019)
* Update pipeline_stable_diffusion_xl.py

fix a bug

* Update pipeline_stable_diffusion_xl_img2img.py

* Update pipeline_stable_diffusion_xl_img2img.py

* Update pipeline_stable_diffusion_upscale.py

* style

---------

Co-authored-by: Hu Ye <xiaohuzc@gmail.com>
2023-07-11 18:29:41 +02:00
Patrick von Platen
638d2bbcd9 [DiffusionPipeline] Deprecate not throwing error when loading non-existant variant (#4011)
* Deprecate variant nicely

* make style

* Apply suggestions from code review

Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>

* Apply suggestions from code review

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

---------

Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
2023-07-11 18:29:32 +02:00
Patrick von Platen
4dfcfaa137 Make sure torch compile doesn't access unet config (#4008) 2023-07-11 18:29:19 +02:00
Patrick von Platen
1c0f6bb2cf Release: v0.18.1 2023-07-07 16:54:13 +02:00
28 changed files with 278 additions and 80 deletions

View File

@@ -21,7 +21,7 @@ The abstract of the paper is the following:
 ## Tips

 - Stable Diffusion XL works especially well with images between 768 and 1024.
-- Stable Diffusion XL output image can be improved by making use of a refiner as shown below
+- Stable Diffusion XL output image can be improved by making use of a refiner as shown below.

 ### Available checkpoints:
@@ -40,7 +40,7 @@ pip install safetensors
 pip install invisible-watermark>=2.0
 ```

-### *Text-to-Image*
+### Text-to-Image

 You can use SDXL as follows for *text-to-image*:
@@ -71,6 +71,7 @@ pipe = StableDiffusionXLPipeline.from_pretrained(
 )
 pipe.to("cuda")

+use_refiner = True
 refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained(
     "stabilityai/stable-diffusion-xl-refiner-0.9", torch_dtype=torch.float16, use_safetensors=True, variant="fp16"
 )
@@ -82,7 +83,29 @@ image = pipe(prompt=prompt, output_type="latent" if use_refiner else "pil").imag
 image = refiner(prompt=prompt, image=image[None, :]).images[0]
 ```

-### Loading single file checkpoitns / original file format
+### Image-to-image
+
+```py
+import torch
+from diffusers import StableDiffusionXLImg2ImgPipeline
+from diffusers.utils import load_image
+
+pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-refiner-0.9", torch_dtype=torch.float16
+)
+pipe = pipe.to("cuda")
+url = "https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/aa_xl/000000009.png"
+
+init_image = load_image(url).convert("RGB")
+prompt = "a photo of an astronaut riding a horse on mars"
+image = pipe(prompt, image=init_image).images[0]
+```
+
+| Original Image | Refined Image |
+|---|---|
+| ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/sd_xl/init_image.png) | ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/sd_xl/refined_image.png) |
+
+### Loading single file checkpoints / original file format

 By making use of [`~diffusers.loaders.FromSingleFileMixin.from_single_file`] you can also load the
 original file format into `diffusers`:
@@ -127,7 +150,7 @@ You can speed up inference by making use of `torch.compile`. This should give yo
 + refiner.unet = torch.compile(refiner.unet, mode="reduce-overhead", fullgraph=True)
 ```

-### Running with `torch` < 2.0
+### Running with `torch` \< 2.0

 **Note** that if you want to run Stable Diffusion XL with `torch` < 2.0, please make sure to enable xformers
 attention:

View File

@@ -56,7 +56,7 @@ if is_wandb_available():
     import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")

 logger = get_logger(__name__)

View File

@@ -59,7 +59,7 @@ if is_wandb_available():
     import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")

 logger = logging.getLogger(__name__)

View File

@@ -57,7 +57,7 @@ from diffusers.utils.import_utils import is_xformers_available
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")

 logger = get_logger(__name__)

View File

@@ -59,7 +59,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")

 logger = get_logger(__name__)

View File

@@ -36,7 +36,7 @@ from diffusers.utils import check_min_version
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")

 # Cache compiled models across invocations of this script.
 cc.initialize_cache(os.path.expanduser("~/.cache/jax/compilation_cache"))

View File

@@ -65,7 +65,7 @@ from diffusers.utils.import_utils import is_xformers_available
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")

 logger = get_logger(__name__)

View File

@@ -52,7 +52,7 @@ from diffusers.utils.import_utils import is_xformers_available
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")

 logger = get_logger(__name__, log_level="INFO")

View File

@@ -54,7 +54,7 @@ if is_wandb_available():
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")

 logger = get_logger(__name__, log_level="INFO")

View File

@@ -33,7 +33,7 @@ from diffusers.utils import check_min_version
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")

 logger = logging.getLogger(__name__)

View File

@@ -48,7 +48,7 @@ from diffusers.utils.import_utils import is_xformers_available
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")

 logger = get_logger(__name__, log_level="INFO")

View File

@@ -78,7 +78,7 @@ else:
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")

 logger = get_logger(__name__)

View File

@@ -56,7 +56,7 @@ else:
 # ------------------------------------------------------------------------------

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")

 logger = logging.getLogger(__name__)

View File

@@ -29,7 +29,7 @@ from diffusers.utils.import_utils import is_xformers_available
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")

 logger = get_logger(__name__, log_level="INFO")

View File

@@ -232,7 +232,7 @@ install_requires = [
 setup(
     name="diffusers",
-    version="0.18.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="0.18.2",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
     description="Diffusers",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",

View File

@@ -1,4 +1,4 @@
-__version__ = "0.18.0.dev0"
+__version__ = "0.18.2"

 from .configuration_utils import ConfigMixin
 from .utils import (

View File

@@ -607,7 +607,7 @@ def register_to_config(init):
         # Take note of the parameters that were not present in the loaded config
         if len(set(new_kwargs.keys()) - set(init_kwargs)) > 0:
-            new_kwargs["_use_default_values"] = set(new_kwargs.keys()) - set(init_kwargs)
+            new_kwargs["_use_default_values"] = list(set(new_kwargs.keys()) - set(init_kwargs))

         new_kwargs = {**config_init_kwargs, **new_kwargs}
         getattr(self, "register_to_config")(**new_kwargs)
@@ -655,7 +655,7 @@ def flax_register_to_config(cls):
         # Take note of the parameters that were not present in the loaded config
         if len(set(new_kwargs.keys()) - set(init_kwargs)) > 0:
-            new_kwargs["_use_default_values"] = set(new_kwargs.keys()) - set(init_kwargs)
+            new_kwargs["_use_default_values"] = list(set(new_kwargs.keys()) - set(init_kwargs))

         getattr(self, "register_to_config")(**new_kwargs)
         original_init(self, *args, **kwargs)
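Note: the fix above matters because Python sets are not JSON-serializable, and the config is ultimately dumped to JSON. A minimal sketch of the failure mode, assuming the config goes through `json.dumps`:

```python
import json

defaulted = {"sample_size", "in_channels"}  # keys that fell back to their defaults

try:
    json.dumps({"_use_default_values": defaulted})  # the old set-valued entry
except TypeError as err:
    print(err)  # "Object of type set is not JSON serializable"

# Stored as a list, as in the hunks above, the config round-trips cleanly:
print(json.dumps({"_use_default_values": list(defaulted)}))
```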

View File

@@ -1394,7 +1394,7 @@ class FromSingleFileMixin:
         use_auth_token = kwargs.pop("use_auth_token", None)
         revision = kwargs.pop("revision", None)
         extract_ema = kwargs.pop("extract_ema", False)
-        image_size = kwargs.pop("image_size", 512)
+        image_size = kwargs.pop("image_size", None)
         scheduler_type = kwargs.pop("scheduler_type", "pndm")
         num_in_channels = kwargs.pop("num_in_channels", None)
         upcast_attention = kwargs.pop("upcast_attention", None)
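Note: with the default changed from 512 to None, the conversion code further down can pick a per-model resolution (SDXL resolves to 1024) instead of silently assuming 512. A short usage sketch — the local checkpoint path is illustrative:

```python
from diffusers import StableDiffusionXLPipeline

# image_size can now be omitted; it is inferred from the checkpoint type.
pipe = StableDiffusionXLPipeline.from_single_file("./sd_xl_base_0.9.safetensors")
```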

View File

@@ -1213,6 +1213,15 @@ class DiffusionPipeline(ConfigMixin):
             filenames = {sibling.rfilename for sibling in info.siblings}
             model_filenames, variant_filenames = variant_compatible_siblings(filenames, variant=variant)

+            if len(variant_filenames) == 0 and variant is not None:
+                deprecation_message = (
+                    f"You are trying to load the model files of the `variant={variant}`, but no such modeling files are available."
+                    f"The default model files: {model_filenames} will be loaded instead. Make sure to not load from `variant={variant}`"
+                    "if such variant modeling files are not available. Doing so will lead to an error in v0.22.0 as defaulting to non-variant"
+                    "modeling files is deprecated."
+                )
+                deprecate("no variant default", "0.22.0", deprecation_message, standard_warn=False)
+
             # remove ignored filenames
             model_filenames = set(model_filenames) - set(ignore_filenames)
             variant_filenames = set(variant_filenames) - set(ignore_filenames)
@@ -1302,7 +1311,7 @@ class DiffusionPipeline(ConfigMixin):
             snapshot_folder = Path(config_file).parent
             pipeline_is_cached = all((snapshot_folder / f).is_file() for f in expected_files)

-            if pipeline_is_cached:
+            if pipeline_is_cached and not force_download:
                 # if the pipeline is cached, we can directly return it
                 # else call snapshot_download
                 return snapshot_folder
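Note: before this fix, a fully cached pipeline short-circuited the download even when the caller explicitly asked for a fresh copy. A minimal sketch of the now-honored flag (the repo id is illustrative):

```python
from diffusers import DiffusionPipeline

# With the fix, force_download=True re-downloads the pipeline files
# instead of returning the cached snapshot folder.
folder = DiffusionPipeline.download("runwayml/stable-diffusion-v1-5", force_download=True)
```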

View File

@@ -24,6 +24,7 @@ from transformers import (
     AutoFeatureExtractor,
     BertTokenizerFast,
     CLIPImageProcessor,
+    CLIPTextConfig,
     CLIPTextModel,
     CLIPTextModelWithProjection,
     CLIPTokenizer,
@@ -48,7 +49,7 @@ from ...schedulers import (
     PNDMScheduler,
     UnCLIPScheduler,
 )
-from ...utils import is_omegaconf_available, is_safetensors_available, logging
+from ...utils import is_accelerate_available, is_omegaconf_available, is_safetensors_available, logging
 from ...utils.import_utils import BACKENDS_MAPPING
 from ..latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
 from ..paint_by_example import PaintByExampleImageEncoder
@@ -57,6 +58,10 @@ from .safety_checker import StableDiffusionSafetyChecker
 from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer

+if is_accelerate_available():
+    from accelerate import init_empty_weights
+    from accelerate.utils import set_module_tensor_to_device
+

 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -770,11 +775,12 @@ def convert_ldm_bert_checkpoint(checkpoint, config):
 def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder=None):
-    text_model = (
-        CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
-        if text_encoder is None
-        else text_encoder
-    )
+    if text_encoder is None:
+        config_name = "openai/clip-vit-large-patch14"
+        config = CLIPTextConfig.from_pretrained(config_name)
+
+        with init_empty_weights():
+            text_model = CLIPTextModel(config)

     keys = list(checkpoint.keys())
@@ -787,7 +793,8 @@ def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder
         if key.startswith(prefix):
             text_model_dict[key[len(prefix + ".") :]] = checkpoint[key]

-    text_model.load_state_dict(text_model_dict)
+    for param_name, param in text_model_dict.items():
+        set_module_tensor_to_device(text_model, param_name, "cpu", value=param)

     return text_model
@@ -884,14 +891,26 @@ def convert_paint_by_example_checkpoint(checkpoint):
     return model

-def convert_open_clip_checkpoint(checkpoint, prefix="cond_stage_model.model."):
+def convert_open_clip_checkpoint(
+    checkpoint, config_name, prefix="cond_stage_model.model.", has_projection=False, **config_kwargs
+):
     # text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder")
-    text_model = CLIPTextModelWithProjection.from_pretrained(
-        "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", projection_dim=1280
-    )
+    # text_model = CLIPTextModelWithProjection.from_pretrained(
+    #     "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", projection_dim=1280
+    # )
+    config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs)
+
+    with init_empty_weights():
+        text_model = CLIPTextModelWithProjection(config) if has_projection else CLIPTextModel(config)

     keys = list(checkpoint.keys())

+    keys_to_ignore = []
+    if config_name == "stabilityai/stable-diffusion-2" and config.num_hidden_layers == 23:
+        # make sure to remove all keys > 22
+        keys_to_ignore += [k for k in keys if k.startswith("cond_stage_model.model.transformer.resblocks.23")]
+        keys_to_ignore += ["cond_stage_model.model.text_projection"]
+
     text_model_dict = {}

     if prefix + "text_projection" in checkpoint:
@@ -902,8 +921,8 @@ def convert_open_clip_checkpoint(checkpoint, prefix="cond_stage_model.model."):
     text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids")

     for key in keys:
-        # if "resblocks.23" in key:  # Diffusers drops the final layer and only uses the penultimate layer
-        #     continue
+        if key in keys_to_ignore:
+            continue
         if key[len(prefix) :] in textenc_conversion_map:
             if key.endswith("text_projection"):
                 value = checkpoint[key].T
@@ -931,7 +950,8 @@ def convert_open_clip_checkpoint(checkpoint, prefix="cond_stage_model.model."):
             text_model_dict[new_key] = checkpoint[key]

-    text_model.load_state_dict(text_model_dict)
+    for param_name, param in text_model_dict.items():
+        set_module_tensor_to_device(text_model, param_name, "cpu", value=param)

     return text_model
@@ -1061,7 +1081,7 @@ def convert_controlnet_checkpoint(
 def download_from_original_stable_diffusion_ckpt(
     checkpoint_path: str,
     original_config_file: str = None,
-    image_size: int = 512,
+    image_size: Optional[int] = None,
     prediction_type: str = None,
     model_type: str = None,
     extract_ema: bool = False,
@@ -1144,6 +1164,7 @@ def download_from_original_stable_diffusion_ckpt(
         LDMTextToImagePipeline,
         PaintByExamplePipeline,
         StableDiffusionControlNetPipeline,
+        StableDiffusionInpaintPipeline,
         StableDiffusionPipeline,
         StableDiffusionXLImg2ImgPipeline,
         StableDiffusionXLPipeline,
@@ -1166,12 +1187,9 @@ def download_from_original_stable_diffusion_ckpt(
         if not is_safetensors_available():
             raise ValueError(BACKENDS_MAPPING["safetensors"][1])

-        from safetensors import safe_open
+        from safetensors.torch import load_file as safe_load

-        checkpoint = {}
-        with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
-            for key in f.keys():
-                checkpoint[key] = f.get_tensor(key)
+        checkpoint = safe_load(checkpoint_path, device="cpu")
     else:
         if device is None:
             device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -1183,7 +1201,7 @@ def download_from_original_stable_diffusion_ckpt(
     if "global_step" in checkpoint:
         global_step = checkpoint["global_step"]
     else:
-        logger.warning("global_step key not found in model")
+        logger.debug("global_step key not found in model")
         global_step = None

     # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
@@ -1230,8 +1248,15 @@ def download_from_original_stable_diffusion_ckpt(
             model_type = "SDXL"
         else:
             model_type = "SDXL-Refiner"

-    if num_in_channels is not None:
+        if image_size is None:
+            image_size = 1024
+
+    if num_in_channels is None and pipeline_class == StableDiffusionInpaintPipeline:
+        num_in_channels = 9
+    elif num_in_channels is None:
+        num_in_channels = 4
+
+    if "unet_config" in original_config.model.params:
         original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels

     if (
@@ -1263,7 +1288,6 @@ def download_from_original_stable_diffusion_ckpt(
     num_train_timesteps = getattr(original_config.model.params, "timesteps", None) or 1000

     if model_type in ["SDXL", "SDXL-Refiner"]:
-        image_size = 1024
         scheduler_dict = {
             "beta_schedule": "scaled_linear",
             "beta_start": 0.00085,
@@ -1279,7 +1303,6 @@ def download_from_original_stable_diffusion_ckpt(
         }
         scheduler = EulerDiscreteScheduler.from_config(scheduler_dict)
         scheduler_type = "euler"
-        vae_path = "stabilityai/sdxl-vae"
     else:
         beta_start = getattr(original_config.model.params, "linear_start", None) or 0.02
         beta_end = getattr(original_config.model.params, "linear_end", None) or 0.085
@@ -1318,25 +1341,45 @@ def download_from_original_stable_diffusion_ckpt(
     # Convert the UNet2DConditionModel model.
     unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
     unet_config["upcast_attention"] = upcast_attention
-    unet = UNet2DConditionModel(**unet_config)
+
+    with init_empty_weights():
+        unet = UNet2DConditionModel(**unet_config)

     converted_unet_checkpoint = convert_ldm_unet_checkpoint(
         checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema
     )
-    unet.load_state_dict(converted_unet_checkpoint)
+
+    for param_name, param in converted_unet_checkpoint.items():
+        set_module_tensor_to_device(unet, param_name, "cpu", value=param)

     # Convert the VAE model.
     if vae_path is None:
         vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
         converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
-        vae = AutoencoderKL(**vae_config)
-        vae.load_state_dict(converted_vae_checkpoint)
+
+        if (
+            "model" in original_config
+            and "params" in original_config.model
+            and "scale_factor" in original_config.model.params
+        ):
+            vae_scaling_factor = original_config.model.params.scale_factor
+        else:
+            vae_scaling_factor = 0.18215  # default SD scaling factor
+
+        vae_config["scaling_factor"] = vae_scaling_factor
+
+        with init_empty_weights():
+            vae = AutoencoderKL(**vae_config)
+
+        for param_name, param in converted_vae_checkpoint.items():
+            set_module_tensor_to_device(vae, param_name, "cpu", value=param)
     else:
         vae = AutoencoderKL.from_pretrained(vae_path)

     if model_type == "FrozenOpenCLIPEmbedder":
-        text_model = convert_open_clip_checkpoint(checkpoint)
+        config_name = "stabilityai/stable-diffusion-2"
+        config_kwargs = {"subfolder": "text_encoder"}
+
+        text_model = convert_open_clip_checkpoint(checkpoint, config_name, **config_kwargs)
+
         tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer")

         if stable_unclip is None:
@@ -1469,7 +1512,12 @@ def download_from_original_stable_diffusion_ckpt(
             tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
             text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
             tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!")
-            text_encoder_2 = convert_open_clip_checkpoint(checkpoint, prefix="conditioner.embedders.1.model.")
+
+            config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
+            config_kwargs = {"projection_dim": 1280}
+            text_encoder_2 = convert_open_clip_checkpoint(
+                checkpoint, config_name, prefix="conditioner.embedders.1.model.", has_projection=True, **config_kwargs
+            )

             pipe = StableDiffusionXLPipeline(
                 vae=vae,
@@ -1485,7 +1533,12 @@ def download_from_original_stable_diffusion_ckpt(
             tokenizer = None
             text_encoder = None
             tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!")
-            text_encoder_2 = convert_open_clip_checkpoint(checkpoint, prefix="conditioner.embedders.0.model.")
+
+            config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
+            config_kwargs = {"projection_dim": 1280}
+            text_encoder_2 = convert_open_clip_checkpoint(
+                checkpoint, config_name, prefix="conditioner.embedders.0.model.", has_projection=True, **config_kwargs
+            )

             pipe = StableDiffusionXLImg2ImgPipeline(
                 vae=vae,
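Note: the hunks above repeatedly replace `load_state_dict` with accelerate's meta-device pattern, which avoids allocating and randomly initializing weights that are immediately overwritten by the converted checkpoint. A self-contained sketch of that pattern on a toy module:

```python
import torch
from accelerate import init_empty_weights
from accelerate.utils import set_module_tensor_to_device

# Build the module on the "meta" device: no weight memory is allocated yet.
with init_empty_weights():
    model = torch.nn.Linear(4, 4)

# Materialize each tensor directly from the (converted) checkpoint values.
state_dict = {"weight": torch.randn(4, 4), "bias": torch.zeros(4)}
for param_name, param in state_dict.items():
    set_module_tensor_to_device(model, param_name, "cpu", value=param)

print(model.weight.device)  # cpu -- parameters now hold the checkpoint values
```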

View File

@@ -24,7 +24,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
 from ...configuration_utils import FrozenDict
 from ...image_processor import VaeImageProcessor
-from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor
@@ -153,7 +153,9 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool
     return mask, masked_image

-class StableDiffusionInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
+class StableDiffusionInpaintPipeline(
+    DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
+):
     r"""
     Pipeline for text-guided image inpainting using Stable Diffusion.
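Note: adding `FromSingleFileMixin` to the bases gives the inpainting pipeline a working `from_single_file`, as exercised by the new tests further down. A short sketch (local checkpoint path illustrative):

```python
import torch
from diffusers import StableDiffusionInpaintPipeline

pipe = StableDiffusionInpaintPipeline.from_single_file(
    "./sd-v1-5-inpainting.ckpt", torch_dtype=torch.float16
)
```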

View File

@@ -748,15 +748,19 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline, TextualInversionLoaderMi
         # make sure the VAE is in float32 mode, as it overflows in float16
         self.vae.to(dtype=torch.float32)

-        use_torch_2_0_or_xformers = self.vae.decoder.mid_block.attentions[0].processor in [
-            AttnProcessor2_0,
-            XFormersAttnProcessor,
-            LoRAXFormersAttnProcessor,
-            LoRAAttnProcessor2_0,
-        ]
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+                LoRAXFormersAttnProcessor,
+                LoRAAttnProcessor2_0,
+            ),
+        )

         # if xformers or torch_2_0 is used attention block does not need
         # to be in float32 which can save lots of memory
-        if not use_torch_2_0_or_xformers:
+        if use_torch_2_0_or_xformers:
             self.vae.post_quant_conv.to(latents.dtype)
             self.vae.decoder.conv_in.to(latents.dtype)
             self.vae.decoder.mid_block.to(latents.dtype)
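Note: the old membership test compared a processor *instance* against a list of *classes*, which always evaluates False, so the branch never reflected the actual processor; `isinstance` plus the flipped `not` means the VAE attention blocks are downcast to float16 only when torch 2.0 / xFormers attention is really in use. A toy reproduction (the class name is reused purely for illustration):

```python
class AttnProcessor2_0:  # stand-in for the real diffusers processor class
    pass

processor = AttnProcessor2_0()

print(processor in [AttnProcessor2_0])             # False: instance vs. class
print(isinstance(processor, (AttnProcessor2_0,)))  # True: the intended check
```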

View File

@@ -129,6 +129,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
         self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.default_sample_size = self.unet.config.sample_size

         self.watermark = StableDiffusionXLWatermarker()
@@ -652,8 +653,8 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
                 "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
         """
         # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * self.vae_scale_factor
-        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor

         original_size = original_size or (height, width)
         target_size = target_size or (height, width)
@@ -785,15 +786,18 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
         # make sure the VAE is in float32 mode, as it overflows in float16
         self.vae.to(dtype=torch.float32)

-        use_torch_2_0_or_xformers = self.vae.decoder.mid_block.attentions[0].processor in [
-            AttnProcessor2_0,
-            XFormersAttnProcessor,
-            LoRAXFormersAttnProcessor,
-            LoRAAttnProcessor2_0,
-        ]
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+                LoRAXFormersAttnProcessor,
+                LoRAAttnProcessor2_0,
+            ),
+        )

         # if xformers or torch_2_0 is used attention block does not need
         # to be in float32 which can save lots of memory
-        if not use_torch_2_0_or_xformers:
+        if use_torch_2_0_or_xformers:
             self.vae.post_quant_conv.to(latents.dtype)
             self.vae.decoder.conv_in.to(latents.dtype)
             self.vae.decoder.mid_block.to(latents.dtype)
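Note: caching `sample_size` in `__init__` keeps the compiled call path from reading `unet.config` at inference time, which `torch.compile` trips over (see #4008 in the commit list above). With SDXL's usual configuration the default resolution works out as follows — the values below are illustrative, not read from a checkpoint:

```python
sample_size = 128                          # unet.config.sample_size for SDXL
block_out_channels = [128, 256, 512, 512]  # vae.config.block_out_channels
vae_scale_factor = 2 ** (len(block_out_channels) - 1)  # -> 8

print(sample_size * vae_scale_factor)  # 1024: the default height and width
```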

View File

@@ -859,15 +859,18 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
         # make sure the VAE is in float32 mode, as it overflows in float16
         self.vae.to(dtype=torch.float32)

-        use_torch_2_0_or_xformers = self.vae.decoder.mid_block.attentions[0].processor in [
-            AttnProcessor2_0,
-            XFormersAttnProcessor,
-            LoRAXFormersAttnProcessor,
-            LoRAAttnProcessor2_0,
-        ]
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+                LoRAXFormersAttnProcessor,
+                LoRAAttnProcessor2_0,
+            ),
+        )

         # if xformers or torch_2_0 is used attention block does not need
         # to be in float32 which can save lots of memory
-        if not use_torch_2_0_or_xformers:
+        if use_torch_2_0_or_xformers:
             self.vae.post_quant_conv.to(latents.dtype)
             self.vae.decoder.conv_in.to(latents.dtype)
             self.vae.decoder.mid_block.to(latents.dtype)

View File

@@ -264,7 +264,7 @@ class ConfigTester(unittest.TestCase):
         config_dict = {k: v for k, v in config.config.items() if not k.startswith("_")}

         # make sure that default config has all keys in `_use_default_values`
-        assert set(config_dict.keys()) == config.config._use_default_values
+        assert set(config_dict.keys()) == set(config.config._use_default_values)

         with tempfile.TemporaryDirectory() as tmpdirname:
             config.save_config(tmpdirname)

View File

@@ -20,17 +20,20 @@ import unittest
 import numpy as np
 import torch
+from huggingface_hub import hf_hub_download
 from PIL import Image
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

 from diffusers import (
     AutoencoderKL,
+    DDIMScheduler,
     DPMSolverMultistepScheduler,
     LMSDiscreteScheduler,
     PNDMScheduler,
     StableDiffusionInpaintPipeline,
     UNet2DConditionModel,
 )
+from diffusers.models.attention_processor import AttnProcessor
 from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import prepare_mask_and_masked_image
 from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device
 from diffusers.utils.testing_utils import (
@@ -512,6 +515,42 @@ class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase):
         assert np.abs(expected_slice - image_slice).max() < 6e-4

+    def test_download_local(self):
+        filename = hf_hub_download("runwayml/stable-diffusion-inpainting", filename="sd-v1-5-inpainting.ckpt")
+
+        pipe = StableDiffusionInpaintPipeline.from_single_file(filename, torch_dtype=torch.float16)
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.to("cuda")
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 1
+        image_out = pipe(**inputs).images[0]
+
+        assert image_out.shape == (512, 512, 3)
+
+    def test_download_ckpt_diff_format_is_same(self):
+        ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-inpainting/blob/main/sd-v1-5-inpainting.ckpt"
+
+        pipe = StableDiffusionInpaintPipeline.from_single_file(ckpt_path)
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.unet.set_attn_processor(AttnProcessor())
+        pipe.to("cuda")
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 5
+        image_ckpt = pipe(**inputs).images[0]
+
+        pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting")
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.unet.set_attn_processor(AttnProcessor())
+        pipe.to("cuda")
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 5
+        image = pipe(**inputs).images[0]
+
+        assert np.max(np.abs(image - image_ckpt)) < 1e-4
+

 @nightly
 @require_torch_gpu
@nightly @nightly
@require_torch_gpu @require_torch_gpu

View File

@@ -19,6 +19,7 @@ import unittest
 import numpy as np
 import torch
+from huggingface_hub import hf_hub_download
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

 from diffusers import (
@@ -29,6 +30,7 @@ from diffusers import (
     StableDiffusionPipeline,
     UNet2DConditionModel,
 )
+from diffusers.models.attention_processor import AttnProcessor
 from diffusers.utils import load_numpy, slow, torch_device
 from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
@@ -426,6 +428,40 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
         assert image.shape == (768, 768, 3)
         assert np.abs(expected_image - image).max() < 7.5e-1

+    def test_download_local(self):
+        filename = hf_hub_download("stabilityai/stable-diffusion-2-1", filename="v2-1_768-ema-pruned.safetensors")
+
+        pipe = StableDiffusionPipeline.from_single_file(filename, torch_dtype=torch.float16)
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.to("cuda")
+
+        image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]
+
+        assert image_out.shape == (768, 768, 3)
+
+    def test_download_ckpt_diff_format_is_same(self):
+        single_file_path = (
+            "https://huggingface.co/stabilityai/stable-diffusion-2-1/blob/main/v2-1_768-ema-pruned.safetensors"
+        )
+
+        pipe_single = StableDiffusionPipeline.from_single_file(single_file_path)
+        pipe_single.scheduler = DDIMScheduler.from_config(pipe_single.scheduler.config)
+        pipe_single.unet.set_attn_processor(AttnProcessor())
+        pipe_single.to("cuda")
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image_ckpt = pipe_single("a turtle", num_inference_steps=5, generator=generator, output_type="np").images[0]
+
+        pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1")
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.unet.set_attn_processor(AttnProcessor())
+        pipe.to("cuda")
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image = pipe("a turtle", num_inference_steps=5, generator=generator, output_type="np").images[0]
+
+        assert np.max(np.abs(image - image_ckpt)) < 1e-3
+
     def test_stable_diffusion_text2img_intermediate_state_v_pred(self):
         number_of_steps = 0

View File

@@ -14,6 +14,7 @@
 # limitations under the License.

 import gc
+import glob
 import json
 import os
 import random
@@ -56,6 +57,7 @@ from diffusers import (
     UniPCMultistepScheduler,
     logging,
 )
+from diffusers.pipelines.pipeline_utils import variant_compatible_siblings
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from diffusers.utils import (
     CONFIG_NAME,
@@ -1361,6 +1363,29 @@ class PipelineFastTests(unittest.TestCase):
         assert sd.config.safety_checker != (None, None)
         assert sd.config.feature_extractor != (None, None)

+    def test_warning_no_variant_available(self):
+        variant = "fp16"
+        with self.assertWarns(FutureWarning) as warning_context:
+            cached_folder = StableDiffusionPipeline.download(
+                "hf-internal-testing/diffusers-stable-diffusion-tiny-all", variant=variant
+            )
+
+        assert "but no such modeling files are available" in str(warning_context.warning)
+        assert variant in str(warning_context.warning)
+
+        def get_all_filenames(directory):
+            filenames = glob.glob(directory + "/**", recursive=True)
+            filenames = [f for f in filenames if os.path.isfile(f)]
+            return filenames
+
+        filenames = get_all_filenames(str(cached_folder))
+
+        all_model_files, variant_model_files = variant_compatible_siblings(filenames, variant=variant)
+
+        # make sure that none of the model names are variant model names
+        assert len(variant_model_files) == 0
+        assert len(all_model_files) > 0
+

 @slow
 @require_torch_gpu