Mirror of https://github.com/huggingface/diffusers.git, synced 2025-12-12 15:34:17 +08:00.

Compare commits: v0.29.1 ... v0.18.2-pa (11 commits)
| SHA1 |
|---|
| 5e80827369 |
| 6894056e46 |
| c4402daff1 |
| a2fa787121 |
| ad7befeb2f |
| 1f392ad45b |
| fe5034c540 |
| 0f5e6454dc |
| 638d2bbcd9 |
| 4dfcfaa137 |
| 1c0f6bb2cf |
```diff
@@ -21,7 +21,7 @@ The abstract of the paper is the following:
 ## Tips
 
 - Stable Diffusion XL works especially well with images between 768 and 1024.
-- Stable Diffusion XL output image can be improved by making use of a refiner as shown below
+- Stable Diffusion XL output image can be improved by making use of a refiner as shown below.
 
 ### Available checkpoints:
 
```
````diff
@@ -40,7 +40,7 @@ pip install safetensors
 pip install invisible-watermark>=2.0
 ```
 
-### *Text-to-Image*
+### Text-to-Image
 
 You can use SDXL as follows for *text-to-image*:
 
````
```diff
@@ -71,6 +71,7 @@ pipe = StableDiffusionXLPipeline.from_pretrained(
 )
 pipe.to("cuda")
 
+use_refiner = True
 refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained(
     "stabilityai/stable-diffusion-xl-refiner-0.9", torch_dtype=torch.float16, use_safetensors=True, variant="fp16"
 )
```
````diff
@@ -82,7 +83,29 @@ image = pipe(prompt=prompt, output_type="latent" if use_refiner else "pil").images[0]
 image = refiner(prompt=prompt, image=image[None, :]).images[0]
 ```
 
-### Loading single file checkpoitns / original file format
+### Image-to-image
+
+```py
+import torch
+from diffusers import StableDiffusionXLImg2ImgPipeline
+from diffusers.utils import load_image
+
+pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-refiner-0.9", torch_dtype=torch.float16
+)
+pipe = pipe.to("cuda")
+url = "https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/aa_xl/000000009.png"
+
+init_image = load_image(url).convert("RGB")
+prompt = "a photo of an astronaut riding a horse on mars"
+image = pipe(prompt, image=init_image).images[0]
+```
+
+| Original Image | Refined Image |
+|---|---|
+|  |  |
+
+### Loading single file checkpoints / original file format
 
 By making use of [`~diffusers.loaders.FromSingleFileMixin.from_single_file`] you can also load the
 original file format into `diffusers`:
````
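As a rough illustration of the single-file loading path introduced above, a minimal sketch (the local filename here is hypothetical; any SDXL checkpoint in the original `.safetensors` layout would do):

```py
import torch
from diffusers import StableDiffusionXLPipeline

# Hypothetical local path to an original-format SDXL checkpoint.
pipe = StableDiffusionXLPipeline.from_single_file(
    "./sd_xl_base_0.9.safetensors", torch_dtype=torch.float16
)
pipe.to("cuda")
```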
````diff
@@ -127,7 +150,7 @@ You can speed up inference by making use of `torch.compile`. This should give you
 + refiner.unet = torch.compile(refiner.unet, mode="reduce-overhead", fullgraph=True)
 ```
 
-### Running with `torch` < 2.0
+### Running with `torch` \< 2.0
 
 **Note** that if you want to run Stable Diffusion XL with `torch` < 2.0, please make sure to enable xformers
 attention:
````
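The doc section is cut off at the hunk boundary; enabling xformers attention would look roughly like the sketch below. This is an assumption based on the standard diffusers API, not the exact snippet from the page, and the base-model repo id is assumed from context:

```py
import torch
from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-0.9", torch_dtype=torch.float16
)
# With torch < 2.0 there is no built-in scaled-dot-product attention, so
# memory-efficient attention comes from the xformers package instead.
pipe.enable_xformers_memory_efficient_attention()
```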
```diff
@@ -56,7 +56,7 @@ if is_wandb_available():
     import wandb
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")
 
 logger = get_logger(__name__)
 
```
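The next dozen hunks all make the same one-line change across the training and example scripts: the development pin `0.18.0.dev0` is relaxed to the released `0.18.0`. For context, `check_min_version` is a small guard the example scripts run at import time; a sketch of its use, matching the call shown in the diffs:

```py
from diffusers.utils import check_min_version

# Raises an error at startup if the installed diffusers is older than the
# pinned release, so example scripts fail fast instead of mid-run with
# missing APIs.
check_min_version("0.18.0")
```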
```diff
@@ -59,7 +59,7 @@ if is_wandb_available():
     import wandb
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")
 
 logger = logging.getLogger(__name__)
 
```
```diff
@@ -57,7 +57,7 @@ from diffusers.utils.import_utils import is_xformers_available
 
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")
 
 logger = get_logger(__name__)
 
```
```diff
@@ -59,7 +59,7 @@ if is_wandb_available():
     import wandb
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")
 
 logger = get_logger(__name__)
 
```
```diff
@@ -36,7 +36,7 @@ from diffusers.utils import check_min_version
 
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")
 
 # Cache compiled models across invocations of this script.
 cc.initialize_cache(os.path.expanduser("~/.cache/jax/compilation_cache"))
```
```diff
@@ -65,7 +65,7 @@ from diffusers.utils.import_utils import is_xformers_available
 
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")
 
 logger = get_logger(__name__)
 
```
```diff
@@ -52,7 +52,7 @@ from diffusers.utils.import_utils import is_xformers_available
 
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")
 
 logger = get_logger(__name__, log_level="INFO")
 
```
```diff
@@ -54,7 +54,7 @@ if is_wandb_available():
 
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")
 
 logger = get_logger(__name__, log_level="INFO")
 
```
```diff
@@ -33,7 +33,7 @@ from diffusers.utils import check_min_version
 
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")
 
 logger = logging.getLogger(__name__)
 
```
```diff
@@ -48,7 +48,7 @@ from diffusers.utils.import_utils import is_xformers_available
 
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")
 
 logger = get_logger(__name__, log_level="INFO")
 
```
```diff
@@ -78,7 +78,7 @@ else:
 
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")
 
 logger = get_logger(__name__)
 
```
```diff
@@ -56,7 +56,7 @@ else:
 # ------------------------------------------------------------------------------
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")
 
 logger = logging.getLogger(__name__)
 
```
```diff
@@ -29,7 +29,7 @@ from diffusers.utils.import_utils import is_xformers_available
 
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.18.0.dev0")
+check_min_version("0.18.0")
 
 logger = get_logger(__name__, log_level="INFO")
 
```
setup.py (2 changed lines)
```diff
@@ -232,7 +232,7 @@ install_requires = [
 
 setup(
     name="diffusers",
-    version="0.18.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="0.18.2",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
     description="Diffusers",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
```
```diff
@@ -1,4 +1,4 @@
-__version__ = "0.18.0.dev0"
+__version__ = "0.18.2"
 
 from .configuration_utils import ConfigMixin
 from .utils import (
```
```diff
@@ -607,7 +607,7 @@ def register_to_config(init):
 
         # Take note of the parameters that were not present in the loaded config
         if len(set(new_kwargs.keys()) - set(init_kwargs)) > 0:
-            new_kwargs["_use_default_values"] = set(new_kwargs.keys()) - set(init_kwargs)
+            new_kwargs["_use_default_values"] = list(set(new_kwargs.keys()) - set(init_kwargs))
 
         new_kwargs = {**config_init_kwargs, **new_kwargs}
         getattr(self, "register_to_config")(**new_kwargs)
```
```diff
@@ -655,7 +655,7 @@ def flax_register_to_config(cls):
 
         # Take note of the parameters that were not present in the loaded config
         if len(set(new_kwargs.keys()) - set(init_kwargs)) > 0:
-            new_kwargs["_use_default_values"] = set(new_kwargs.keys()) - set(init_kwargs)
+            new_kwargs["_use_default_values"] = list(set(new_kwargs.keys()) - set(init_kwargs))
 
         getattr(self, "register_to_config")(**new_kwargs)
         original_init(self, *args, **kwargs)
```
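A plausible reading of the two `register_to_config` hunks above: `_use_default_values` is written into the model's `config.json`, and a Python `set` is not JSON-serializable, so storing it as a `list` fixes config saving. A quick sketch of the failure mode:

```py
import json

# A set cannot be dumped to JSON, which is how diffusers configs are persisted:
try:
    json.dumps({"_use_default_values": {"beta_start", "beta_end"}})
except TypeError as err:
    print(err)  # Object of type set is not JSON serializable

# A list round-trips cleanly:
print(json.dumps({"_use_default_values": ["beta_start", "beta_end"]}))
```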
```diff
@@ -1394,7 +1394,7 @@ class FromSingleFileMixin:
         use_auth_token = kwargs.pop("use_auth_token", None)
         revision = kwargs.pop("revision", None)
         extract_ema = kwargs.pop("extract_ema", False)
-        image_size = kwargs.pop("image_size", 512)
+        image_size = kwargs.pop("image_size", None)
         scheduler_type = kwargs.pop("scheduler_type", "pndm")
         num_in_channels = kwargs.pop("num_in_channels", None)
         upcast_attention = kwargs.pop("upcast_attention", None)
```
```diff
@@ -1213,6 +1213,15 @@ class DiffusionPipeline(ConfigMixin):
         filenames = {sibling.rfilename for sibling in info.siblings}
         model_filenames, variant_filenames = variant_compatible_siblings(filenames, variant=variant)
 
+        if len(variant_filenames) == 0 and variant is not None:
+            deprecation_message = (
+                f"You are trying to load the model files of the `variant={variant}`, but no such modeling files are available."
+                f"The default model files: {model_filenames} will be loaded instead. Make sure to not load from `variant={variant}`"
+                "if such variant modeling files are not available. Doing so will lead to an error in v0.22.0 as defaulting to non-variant"
+                "modeling files is deprecated."
+            )
+            deprecate("no variant default", "0.22.0", deprecation_message, standard_warn=False)
+
         # remove ignored filenames
         model_filenames = set(model_filenames) - set(ignore_filenames)
         variant_filenames = set(variant_filenames) - set(ignore_filenames)
```
```diff
@@ -1302,7 +1311,7 @@ class DiffusionPipeline(ConfigMixin):
         snapshot_folder = Path(config_file).parent
         pipeline_is_cached = all((snapshot_folder / f).is_file() for f in expected_files)
 
-        if pipeline_is_cached:
+        if pipeline_is_cached and not force_download:
             # if the pipeline is cached, we can directly return it
             # else call snapshot_download
             return snapshot_folder
```
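Taken together, the two `DiffusionPipeline` hunks change download behavior: a missing variant now warns and falls back to the default weights (with a hard error scheduled for v0.22.0), and a locally cached pipeline is re-fetched when `force_download` is set. A sketch of triggering the new warning, with the repo id taken from the test added later in this diff:

```py
from diffusers import StableDiffusionPipeline

# This tiny test repo ships no fp16 variant, so the call emits a
# FutureWarning and downloads the default (non-variant) weights instead.
folder = StableDiffusionPipeline.download(
    "hf-internal-testing/diffusers-stable-diffusion-tiny-all", variant="fp16"
)
print(folder)
```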
```diff
@@ -24,6 +24,7 @@ from transformers import (
     AutoFeatureExtractor,
     BertTokenizerFast,
     CLIPImageProcessor,
+    CLIPTextConfig,
     CLIPTextModel,
     CLIPTextModelWithProjection,
     CLIPTokenizer,
```
```diff
@@ -48,7 +49,7 @@ from ...schedulers import (
     PNDMScheduler,
     UnCLIPScheduler,
 )
-from ...utils import is_omegaconf_available, is_safetensors_available, logging
+from ...utils import is_accelerate_available, is_omegaconf_available, is_safetensors_available, logging
 from ...utils.import_utils import BACKENDS_MAPPING
 from ..latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
 from ..paint_by_example import PaintByExampleImageEncoder
```
```diff
@@ -57,6 +58,10 @@ from .safety_checker import StableDiffusionSafetyChecker
 from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
 
 
+if is_accelerate_available():
+    from accelerate import init_empty_weights
+    from accelerate.utils import set_module_tensor_to_device
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
```
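These imports set up the low-memory loading pattern used throughout the rest of this file: build the module on the meta device, where no weight memory is allocated, then materialize tensors one by one from checkpoint values. A standalone sketch of the pattern, independent of diffusers:

```py
import torch
from accelerate import init_empty_weights
from accelerate.utils import set_module_tensor_to_device

# Parameters created inside this context live on the "meta" device and take
# no memory; only shapes and dtypes are tracked.
with init_empty_weights():
    layer = torch.nn.Linear(1024, 1024)

# Materialize each tensor directly from checkpoint values, avoiding the
# second full copy that layer.load_state_dict(state_dict) would require.
state_dict = {"weight": torch.zeros(1024, 1024), "bias": torch.zeros(1024)}
for name, value in state_dict.items():
    set_module_tensor_to_device(layer, name, "cpu", value=value)
```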
```diff
@@ -770,11 +775,12 @@ def convert_ldm_bert_checkpoint(checkpoint, config):
 
 
 def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder=None):
-    text_model = (
-        CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
-        if text_encoder is None
-        else text_encoder
-    )
+    if text_encoder is None:
+        config_name = "openai/clip-vit-large-patch14"
+        config = CLIPTextConfig.from_pretrained(config_name)
+
+        with init_empty_weights():
+            text_model = CLIPTextModel(config)
 
     keys = list(checkpoint.keys())
 
```
```diff
@@ -787,7 +793,8 @@ def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder=None):
         if key.startswith(prefix):
             text_model_dict[key[len(prefix + ".") :]] = checkpoint[key]
 
-    text_model.load_state_dict(text_model_dict)
+    for param_name, param in text_model_dict.items():
+        set_module_tensor_to_device(text_model, param_name, "cpu", value=param)
 
     return text_model
 
```
```diff
@@ -884,14 +891,26 @@ def convert_paint_by_example_checkpoint(checkpoint):
     return model
 
 
-def convert_open_clip_checkpoint(checkpoint, prefix="cond_stage_model.model."):
+def convert_open_clip_checkpoint(
+    checkpoint, config_name, prefix="cond_stage_model.model.", has_projection=False, **config_kwargs
+):
     # text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder")
-    text_model = CLIPTextModelWithProjection.from_pretrained(
-        "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", projection_dim=1280
-    )
+    # text_model = CLIPTextModelWithProjection.from_pretrained(
+    #     "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", projection_dim=1280
+    # )
+    config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs)
+
+    with init_empty_weights():
+        text_model = CLIPTextModelWithProjection(config) if has_projection else CLIPTextModel(config)
 
     keys = list(checkpoint.keys())
 
+    keys_to_ignore = []
+    if config_name == "stabilityai/stable-diffusion-2" and config.num_hidden_layers == 23:
+        # make sure to remove all keys > 22
+        keys_to_ignore += [k for k in keys if k.startswith("cond_stage_model.model.transformer.resblocks.23")]
+        keys_to_ignore += ["cond_stage_model.model.text_projection"]
+
     text_model_dict = {}
 
     if prefix + "text_projection" in checkpoint:
```
```diff
@@ -902,8 +921,8 @@ def convert_open_clip_checkpoint(
     text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids")
 
     for key in keys:
-        # if "resblocks.23" in key:  # Diffusers drops the final layer and only uses the penultimate layer
-        #     continue
+        if key in keys_to_ignore:
+            continue
         if key[len(prefix) :] in textenc_conversion_map:
             if key.endswith("text_projection"):
                 value = checkpoint[key].T
```
```diff
@@ -931,7 +950,8 @@ def convert_open_clip_checkpoint(
 
             text_model_dict[new_key] = checkpoint[key]
 
-    text_model.load_state_dict(text_model_dict)
+    for param_name, param in text_model_dict.items():
+        set_module_tensor_to_device(text_model, param_name, "cpu", value=param)
 
     return text_model
 
```
```diff
@@ -1061,7 +1081,7 @@ def convert_controlnet_checkpoint(
 def download_from_original_stable_diffusion_ckpt(
     checkpoint_path: str,
     original_config_file: str = None,
-    image_size: int = 512,
+    image_size: Optional[int] = None,
     prediction_type: str = None,
     model_type: str = None,
     extract_ema: bool = False,
```
```diff
@@ -1144,6 +1164,7 @@ def download_from_original_stable_diffusion_ckpt(
         LDMTextToImagePipeline,
         PaintByExamplePipeline,
         StableDiffusionControlNetPipeline,
+        StableDiffusionInpaintPipeline,
         StableDiffusionPipeline,
         StableDiffusionXLImg2ImgPipeline,
         StableDiffusionXLPipeline,
```
```diff
@@ -1166,12 +1187,9 @@ def download_from_original_stable_diffusion_ckpt(
         if not is_safetensors_available():
             raise ValueError(BACKENDS_MAPPING["safetensors"][1])
 
-        from safetensors import safe_open
+        from safetensors.torch import load_file as safe_load
 
-        checkpoint = {}
-        with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
-            for key in f.keys():
-                checkpoint[key] = f.get_tensor(key)
+        checkpoint = safe_load(checkpoint_path, device="cpu")
     else:
         if device is None:
             device = "cuda" if torch.cuda.is_available() else "cpu"
```
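`load_file` does in one call what the removed `safe_open` loop did by hand. A sketch (the path is hypothetical):

```py
from safetensors.torch import load_file

# Returns a dict mapping tensor names to torch.Tensors, loaded straight
# to the requested device.
state_dict = load_file("model.safetensors", device="cpu")
```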
```diff
@@ -1183,7 +1201,7 @@ def download_from_original_stable_diffusion_ckpt(
     if "global_step" in checkpoint:
         global_step = checkpoint["global_step"]
     else:
-        logger.warning("global_step key not found in model")
+        logger.debug("global_step key not found in model")
         global_step = None
 
     # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
```
```diff
@@ -1230,8 +1248,15 @@ def download_from_original_stable_diffusion_ckpt(
             model_type = "SDXL"
         else:
             model_type = "SDXL-Refiner"
+        if image_size is None:
+            image_size = 1024
 
-    if num_in_channels is not None:
+    if num_in_channels is None and pipeline_class == StableDiffusionInpaintPipeline:
+        num_in_channels = 9
+    elif num_in_channels is None:
+        num_in_channels = 4
+
+    if "unet_config" in original_config.model.params:
         original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
 
     if (
```
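The `num_in_channels = 9` default for inpainting matches the standard Stable Diffusion inpainting UNet, whose input concatenates the noisy latents with the mask and the masked-image latents. A sketch of that channel arithmetic (background knowledge, not taken from this diff):

```py
import torch

latents = torch.randn(1, 4, 64, 64)               # noisy image latents
mask = torch.randn(1, 1, 64, 64)                  # downsampled inpainting mask
masked_image_latents = torch.randn(1, 4, 64, 64)  # latents of the masked image

unet_input = torch.cat([latents, mask, masked_image_latents], dim=1)
print(unet_input.shape)  # torch.Size([1, 9, 64, 64]) -> 9 input channels
```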
```diff
@@ -1263,7 +1288,6 @@ def download_from_original_stable_diffusion_ckpt(
     num_train_timesteps = getattr(original_config.model.params, "timesteps", None) or 1000
 
     if model_type in ["SDXL", "SDXL-Refiner"]:
-        image_size = 1024
         scheduler_dict = {
             "beta_schedule": "scaled_linear",
             "beta_start": 0.00085,
```
```diff
@@ -1279,7 +1303,6 @@ def download_from_original_stable_diffusion_ckpt(
         }
         scheduler = EulerDiscreteScheduler.from_config(scheduler_dict)
         scheduler_type = "euler"
-        vae_path = "stabilityai/sdxl-vae"
     else:
         beta_start = getattr(original_config.model.params, "linear_start", None) or 0.02
         beta_end = getattr(original_config.model.params, "linear_end", None) or 0.085
```
```diff
@@ -1318,25 +1341,45 @@ def download_from_original_stable_diffusion_ckpt(
     # Convert the UNet2DConditionModel model.
     unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
     unet_config["upcast_attention"] = upcast_attention
-    unet = UNet2DConditionModel(**unet_config)
+    with init_empty_weights():
+        unet = UNet2DConditionModel(**unet_config)
 
     converted_unet_checkpoint = convert_ldm_unet_checkpoint(
         checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema
     )
-    unet.load_state_dict(converted_unet_checkpoint)
+
+    for param_name, param in converted_unet_checkpoint.items():
+        set_module_tensor_to_device(unet, param_name, "cpu", value=param)
 
     # Convert the VAE model.
     if vae_path is None:
         vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
         converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
 
-        vae = AutoencoderKL(**vae_config)
-        vae.load_state_dict(converted_vae_checkpoint)
+        if (
+            "model" in original_config
+            and "params" in original_config.model
+            and "scale_factor" in original_config.model.params
+        ):
+            vae_scaling_factor = original_config.model.params.scale_factor
+        else:
+            vae_scaling_factor = 0.18215  # default SD scaling factor
+
+        vae_config["scaling_factor"] = vae_scaling_factor
+
+        with init_empty_weights():
+            vae = AutoencoderKL(**vae_config)
+
+        for param_name, param in converted_vae_checkpoint.items():
+            set_module_tensor_to_device(vae, param_name, "cpu", value=param)
     else:
         vae = AutoencoderKL.from_pretrained(vae_path)
 
     if model_type == "FrozenOpenCLIPEmbedder":
-        text_model = convert_open_clip_checkpoint(checkpoint)
+        config_name = "stabilityai/stable-diffusion-2"
+        config_kwargs = {"subfolder": "text_encoder"}
+
+        text_model = convert_open_clip_checkpoint(checkpoint, config_name, **config_kwargs)
         tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer")
 
         if stable_unclip is None:
```
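The new `scaling_factor` plumbing reads `scale_factor` from the original LDM config instead of hard-coding it. For background (values assumed from the released models, not from this diff), the factor rescales VAE latents to roughly unit variance for the diffusion process; a sketch of how it is applied:

```py
import torch
from diffusers import AutoencoderKL

# 0.18215 is the classic Stable Diffusion value; SDXL's VAE uses 0.13025.
vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")
image = torch.randn(1, 3, 256, 256)

latents = vae.encode(image).latent_dist.sample() * vae.config.scaling_factor
decoded = vae.decode(latents / vae.config.scaling_factor).sample
```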
```diff
@@ -1469,7 +1512,12 @@ def download_from_original_stable_diffusion_ckpt(
         tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
         text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
         tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!")
-        text_encoder_2 = convert_open_clip_checkpoint(checkpoint, prefix="conditioner.embedders.1.model.")
+
+        config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
+        config_kwargs = {"projection_dim": 1280}
+        text_encoder_2 = convert_open_clip_checkpoint(
+            checkpoint, config_name, prefix="conditioner.embedders.1.model.", has_projection=True, **config_kwargs
+        )
 
         pipe = StableDiffusionXLPipeline(
             vae=vae,
```
```diff
@@ -1485,7 +1533,12 @@ def download_from_original_stable_diffusion_ckpt(
         tokenizer = None
         text_encoder = None
         tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!")
-        text_encoder_2 = convert_open_clip_checkpoint(checkpoint, prefix="conditioner.embedders.0.model.")
+
+        config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
+        config_kwargs = {"projection_dim": 1280}
+        text_encoder_2 = convert_open_clip_checkpoint(
+            checkpoint, config_name, prefix="conditioner.embedders.0.model.", has_projection=True, **config_kwargs
+        )
 
         pipe = StableDiffusionXLImg2ImgPipeline(
             vae=vae,
```
```diff
@@ -24,7 +24,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
 from ...image_processor import VaeImageProcessor
-from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor
```
```diff
@@ -153,7 +153,9 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool = False):
     return mask, masked_image
 
 
-class StableDiffusionInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
+class StableDiffusionInpaintPipeline(
+    DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
+):
     r"""
     Pipeline for text-guided image inpainting using Stable Diffusion.
 
```
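With `FromSingleFileMixin` added to the bases, the inpainting pipeline gains the same single-file entry point as the other pipelines; a sketch using the checkpoint URL that the new tests below exercise:

```py
from diffusers import StableDiffusionInpaintPipeline

pipe = StableDiffusionInpaintPipeline.from_single_file(
    "https://huggingface.co/runwayml/stable-diffusion-inpainting/blob/main/sd-v1-5-inpainting.ckpt"
)
```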
```diff
@@ -748,15 +748,19 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline, TextualInversionLoaderMixin):
         # make sure the VAE is in float32 mode, as it overflows in float16
         self.vae.to(dtype=torch.float32)
 
-        use_torch_2_0_or_xformers = self.vae.decoder.mid_block.attentions[0].processor in [
-            AttnProcessor2_0,
-            XFormersAttnProcessor,
-            LoRAXFormersAttnProcessor,
-            LoRAAttnProcessor2_0,
-        ]
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+                LoRAXFormersAttnProcessor,
+                LoRAAttnProcessor2_0,
+            ),
+        )
 
         # if xformers or torch_2_0 is used attention block does not need
         # to be in float32 which can save lots of memory
-        if not use_torch_2_0_or_xformers:
+        if use_torch_2_0_or_xformers:
             self.vae.post_quant_conv.to(latents.dtype)
             self.vae.decoder.conv_in.to(latents.dtype)
             self.vae.decoder.mid_block.to(latents.dtype)
```
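The bug fixed here (and in the two identical SDXL hunks below) is easy to reproduce in isolation: the old code tested an attention-processor instance for membership in a list of classes, which is always `False`, so the memory-saving half-precision cast never ran as intended. `isinstance`, plus flipping the branch, restores the behavior the comment describes; a minimal sketch:

```py
class AttnProcessor2_0:  # stand-in for the real diffusers class
    pass

proc = AttnProcessor2_0()

# Old check: an instance is never equal to its class, so this is always False.
print(proc in [AttnProcessor2_0])             # False

# Fixed check:
print(isinstance(proc, (AttnProcessor2_0,)))  # True
```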
```diff
@@ -129,6 +129,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
         self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.default_sample_size = self.unet.config.sample_size
 
         self.watermark = StableDiffusionXLWatermarker()
 
```
```diff
@@ -652,8 +653,8 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
                 "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
         """
         # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * self.vae_scale_factor
-        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        height = height or self.default_sample_size * self.vae_scale_factor
+        width = width or self.default_sample_size * self.vae_scale_factor
 
         original_size = original_size or (height, width)
         target_size = target_size or (height, width)
```
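Caching `default_sample_size` at init means the height/width defaults no longer reach through `self.unet.config` at call time, which can break once the UNet has been wrapped, for example by `torch.compile`; that motivation is an inference, not stated in the diff. The defaulting arithmetic itself, with SDXL base values assumed from the released config:

```py
# sample_size=128 and vae_scale_factor=8 give SDXL's 1024x1024 default.
default_sample_size = 128
vae_scale_factor = 8

height = None or default_sample_size * vae_scale_factor
width = None or default_sample_size * vae_scale_factor
print(height, width)  # 1024 1024
```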
```diff
@@ -785,15 +786,18 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
         # make sure the VAE is in float32 mode, as it overflows in float16
         self.vae.to(dtype=torch.float32)
 
-        use_torch_2_0_or_xformers = self.vae.decoder.mid_block.attentions[0].processor in [
-            AttnProcessor2_0,
-            XFormersAttnProcessor,
-            LoRAXFormersAttnProcessor,
-            LoRAAttnProcessor2_0,
-        ]
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+                LoRAXFormersAttnProcessor,
+                LoRAAttnProcessor2_0,
+            ),
+        )
         # if xformers or torch_2_0 is used attention block does not need
         # to be in float32 which can save lots of memory
-        if not use_torch_2_0_or_xformers:
+        if use_torch_2_0_or_xformers:
             self.vae.post_quant_conv.to(latents.dtype)
             self.vae.decoder.conv_in.to(latents.dtype)
             self.vae.decoder.mid_block.to(latents.dtype)
```
```diff
@@ -859,15 +859,18 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
         # make sure the VAE is in float32 mode, as it overflows in float16
         self.vae.to(dtype=torch.float32)
 
-        use_torch_2_0_or_xformers = self.vae.decoder.mid_block.attentions[0].processor in [
-            AttnProcessor2_0,
-            XFormersAttnProcessor,
-            LoRAXFormersAttnProcessor,
-            LoRAAttnProcessor2_0,
-        ]
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+                LoRAXFormersAttnProcessor,
+                LoRAAttnProcessor2_0,
+            ),
+        )
         # if xformers or torch_2_0 is used attention block does not need
         # to be in float32 which can save lots of memory
-        if not use_torch_2_0_or_xformers:
+        if use_torch_2_0_or_xformers:
             self.vae.post_quant_conv.to(latents.dtype)
             self.vae.decoder.conv_in.to(latents.dtype)
             self.vae.decoder.mid_block.to(latents.dtype)
```
```diff
@@ -264,7 +264,7 @@ class ConfigTester(unittest.TestCase):
         config_dict = {k: v for k, v in config.config.items() if not k.startswith("_")}
 
         # make sure that default config has all keys in `_use_default_values`
-        assert set(config_dict.keys()) == config.config._use_default_values
+        assert set(config_dict.keys()) == set(config.config._use_default_values)
 
         with tempfile.TemporaryDirectory() as tmpdirname:
             config.save_config(tmpdirname)
```
```diff
@@ -20,17 +20,20 @@ import unittest
 
 import numpy as np
 import torch
+from huggingface_hub import hf_hub_download
 from PIL import Image
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
 from diffusers import (
     AutoencoderKL,
+    DDIMScheduler,
     DPMSolverMultistepScheduler,
     LMSDiscreteScheduler,
     PNDMScheduler,
     StableDiffusionInpaintPipeline,
     UNet2DConditionModel,
 )
+from diffusers.models.attention_processor import AttnProcessor
 from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import prepare_mask_and_masked_image
 from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device
 from diffusers.utils.testing_utils import (
```
```diff
@@ -512,6 +515,42 @@ class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase):
 
         assert np.abs(expected_slice - image_slice).max() < 6e-4
 
+    def test_download_local(self):
+        filename = hf_hub_download("runwayml/stable-diffusion-inpainting", filename="sd-v1-5-inpainting.ckpt")
+
+        pipe = StableDiffusionInpaintPipeline.from_single_file(filename, torch_dtype=torch.float16)
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.to("cuda")
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 1
+        image_out = pipe(**inputs).images[0]
+
+        assert image_out.shape == (512, 512, 3)
+
+    def test_download_ckpt_diff_format_is_same(self):
+        ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-inpainting/blob/main/sd-v1-5-inpainting.ckpt"
+
+        pipe = StableDiffusionInpaintPipeline.from_single_file(ckpt_path)
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.unet.set_attn_processor(AttnProcessor())
+        pipe.to("cuda")
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 5
+        image_ckpt = pipe(**inputs).images[0]
+
+        pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting")
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.unet.set_attn_processor(AttnProcessor())
+        pipe.to("cuda")
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 5
+        image = pipe(**inputs).images[0]
+
+        assert np.max(np.abs(image - image_ckpt)) < 1e-4
+
+
 @nightly
 @require_torch_gpu
```
```diff
@@ -19,6 +19,7 @@ import unittest
 
 import numpy as np
 import torch
+from huggingface_hub import hf_hub_download
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
 from diffusers import (
```
```diff
@@ -29,6 +30,7 @@ from diffusers import (
     StableDiffusionPipeline,
     UNet2DConditionModel,
 )
+from diffusers.models.attention_processor import AttnProcessor
 from diffusers.utils import load_numpy, slow, torch_device
 from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
 
```
```diff
@@ -426,6 +428,40 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
         assert image.shape == (768, 768, 3)
         assert np.abs(expected_image - image).max() < 7.5e-1
 
+    def test_download_local(self):
+        filename = hf_hub_download("stabilityai/stable-diffusion-2-1", filename="v2-1_768-ema-pruned.safetensors")
+
+        pipe = StableDiffusionPipeline.from_single_file(filename, torch_dtype=torch.float16)
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.to("cuda")
+
+        image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]
+
+        assert image_out.shape == (768, 768, 3)
+
+    def test_download_ckpt_diff_format_is_same(self):
+        single_file_path = (
+            "https://huggingface.co/stabilityai/stable-diffusion-2-1/blob/main/v2-1_768-ema-pruned.safetensors"
+        )
+
+        pipe_single = StableDiffusionPipeline.from_single_file(single_file_path)
+        pipe_single.scheduler = DDIMScheduler.from_config(pipe_single.scheduler.config)
+        pipe_single.unet.set_attn_processor(AttnProcessor())
+        pipe_single.to("cuda")
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image_ckpt = pipe_single("a turtle", num_inference_steps=5, generator=generator, output_type="np").images[0]
+
+        pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1")
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.unet.set_attn_processor(AttnProcessor())
+        pipe.to("cuda")
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        image = pipe("a turtle", num_inference_steps=5, generator=generator, output_type="np").images[0]
+
+        assert np.max(np.abs(image - image_ckpt)) < 1e-3
+
     def test_stable_diffusion_text2img_intermediate_state_v_pred(self):
         number_of_steps = 0
 
```
```diff
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import gc
+import glob
 import json
 import os
 import random
```
```diff
@@ -56,6 +57,7 @@ from diffusers import (
     UniPCMultistepScheduler,
     logging,
 )
+from diffusers.pipelines.pipeline_utils import variant_compatible_siblings
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from diffusers.utils import (
     CONFIG_NAME,
```
```diff
@@ -1361,6 +1363,29 @@ class PipelineFastTests(unittest.TestCase):
         assert sd.config.safety_checker != (None, None)
         assert sd.config.feature_extractor != (None, None)
 
+    def test_warning_no_variant_available(self):
+        variant = "fp16"
+        with self.assertWarns(FutureWarning) as warning_context:
+            cached_folder = StableDiffusionPipeline.download(
+                "hf-internal-testing/diffusers-stable-diffusion-tiny-all", variant=variant
+            )
+
+        assert "but no such modeling files are available" in str(warning_context.warning)
+        assert variant in str(warning_context.warning)
+
+        def get_all_filenames(directory):
+            filenames = glob.glob(directory + "/**", recursive=True)
+            filenames = [f for f in filenames if os.path.isfile(f)]
+            return filenames
+
+        filenames = get_all_filenames(str(cached_folder))
+
+        all_model_files, variant_model_files = variant_compatible_siblings(filenames, variant=variant)
+
+        # make sure that none of the model names are variant model names
+        assert len(variant_model_files) == 0
+        assert len(all_model_files) > 0
+
 
 @slow
 @require_torch_gpu
```
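For reference, `variant_compatible_siblings` (imported earlier in this test diff) is the helper the new test drives: given repo filenames and a variant, it splits them into default and variant weight files. A sketch, assuming the signature matches its use in the test:

```py
from diffusers.pipelines.pipeline_utils import variant_compatible_siblings

filenames = {"unet/diffusion_pytorch_model.bin", "vae/diffusion_pytorch_model.bin"}
model_files, variant_files = variant_compatible_siblings(filenames, variant="fp16")

# No fp16 files exist in this listing, so the variant side comes back empty.
print(len(model_files) > 0, len(variant_files) == 0)  # True True
```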