mirror of https://github.com/huggingface/diffusers.git
synced 2025-12-21 11:54:51 +08:00

Compare commits: update-not ... fast-gpu-m (1 commit: b59c02fb66)
```diff
@@ -158,9 +158,6 @@ def log_validation(
         f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
         f" {args.validation_prompt}."
     )
-    if args.enable_vae_tiling:
-        pipeline.vae.enable_tiling(tile_sample_min_height=1024, tile_sample_stride_width=1024)
-
     pipeline.text_encoder = pipeline.text_encoder.to(torch.bfloat16)
     pipeline = pipeline.to(accelerator.device)
     pipeline.set_progress_bar_config(disable=True)
@@ -600,7 +597,6 @@ def parse_args(input_args=None):
         help="Whether to offload the VAE and the text encoder to CPU when they are not used.",
     )
     parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
-    parser.add_argument("--enable_vae_tiling", action="store_true", help="Enabla vae tiling in log validation")

     if input_args is not None:
         args = parser.parse_args(input_args)
```
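For context, the removed `--enable_vae_tiling` flag wires the validation run up to diffusers' tiled VAE decoding so large validation images do not exhaust GPU memory. A minimal, hedged sketch of the same call outside the training script; the checkpoint id is a placeholder and the tile keyword names are copied from the diff, so they may differ in other diffusers releases (older versions expose `enable_tiling()` with no arguments):

```python
import torch
from diffusers import DiffusionPipeline

# Placeholder checkpoint id; substitute the model this training script actually targets.
pipe = DiffusionPipeline.from_pretrained("your/checkpoint", torch_dtype=torch.bfloat16)

# Decode the VAE in tiles so a large validation image fits in memory.
# Keyword names mirror the diff above and may not exist on every diffusers version.
pipe.vae.enable_tiling(tile_sample_min_height=1024, tile_sample_stride_width=1024)

pipe = pipe.to("cuda")
image = pipe(prompt="a validation prompt", num_inference_steps=20).images[0]
```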
setup.py (5 changed lines)

```diff
@@ -74,9 +74,8 @@ To create the package for PyPI.
 twine upload dist/* -r pypi

 10. Prepare the release notes and publish them on GitHub once everything is looking hunky-dory. You can use the following
-Space to fetch all the commits applicable for the release: https://huggingface.co/spaces/sayakpaul/auto-release-notes-diffusers.
-It automatically fetches the correct tag and branch but also provides the option to configure them.
-`tag` should be the previous release tag (v0.26.1, for example), and `branch` should be
+Space to fetch all the commits applicable for the release: https://huggingface.co/spaces/lysandre/github-release. Repo should
+be `huggingface/diffusers`. `tag` should be the previous release tag (v0.26.1, for example), and `branch` should be
 the latest release branch (v0.27.0-release, for example). It denotes all commits that have happened on branch
 v0.27.0-release after the tag v0.26.1 was created.
```
```diff
@@ -21,7 +21,6 @@ from huggingface_hub.utils import validate_hf_hub_args
 from ..utils import (
     USE_PEFT_BACKEND,
     deprecate,
-    get_submodule_by_name,
     is_peft_available,
     is_peft_version,
     is_torch_version,
@@ -1982,17 +1981,10 @@ class FluxLoraLoaderMixin(LoraBaseMixin):
         in_features = state_dict[lora_A_weight_name].shape[1]
         out_features = state_dict[lora_B_weight_name].shape[0]

-        # Model maybe loaded with different quantization schemes which may flatten the params.
-        # `bitsandbytes`, for example, flatten the weights when using 4bit. 8bit bnb models
-        # preserve weight shape.
-        module_weight_shape = cls._calculate_module_shape(model=transformer, base_module=module)
-
         # This means there's no need for an expansion in the params, so we simply skip.
-        if tuple(module_weight_shape) == (out_features, in_features):
+        if tuple(module_weight.shape) == (out_features, in_features):
             continue

-        # TODO (sayakpaul): We still need to consider if the module we're expanding is
-        # quantized and handle it accordingly if that is the case.
         module_out_features, module_in_features = module_weight.shape
         debug_message = ""
         if in_features > module_in_features:
@@ -2088,16 +2080,13 @@ class FluxLoraLoaderMixin(LoraBaseMixin):
         base_weight_param = transformer_state_dict[base_param_name]
         lora_A_param = lora_state_dict[f"{prefix}{k}.lora_A.weight"]

-        # TODO (sayakpaul): Handle the cases when we actually need to expand when using quantization.
-        base_module_shape = cls._calculate_module_shape(model=transformer, base_weight_param_name=base_param_name)
-
-        if base_module_shape[1] > lora_A_param.shape[1]:
+        if base_weight_param.shape[1] > lora_A_param.shape[1]:
             shape = (lora_A_param.shape[0], base_weight_param.shape[1])
             expanded_state_dict_weight = torch.zeros(shape, device=base_weight_param.device)
             expanded_state_dict_weight[:, : lora_A_param.shape[1]].copy_(lora_A_param)
             lora_state_dict[f"{prefix}{k}.lora_A.weight"] = expanded_state_dict_weight
             expanded_module_names.add(k)
-        elif base_module_shape[1] < lora_A_param.shape[1]:
+        elif base_weight_param.shape[1] < lora_A_param.shape[1]:
             raise NotImplementedError(
                 f"This LoRA param ({k}.lora_A.weight) has an incompatible shape {lora_A_param.shape}. Please open an issue to file for a feature request - https://github.com/huggingface/diffusers/issues/new."
             )
@@ -2109,28 +2098,6 @@ class FluxLoraLoaderMixin(LoraBaseMixin):

         return lora_state_dict

-    @staticmethod
-    def _calculate_module_shape(
-        model: "torch.nn.Module",
-        base_module: "torch.nn.Linear" = None,
-        base_weight_param_name: str = None,
-    ) -> "torch.Size":
-        def _get_weight_shape(weight: torch.Tensor):
-            return weight.quant_state.shape if weight.__class__.__name__ == "Params4bit" else weight.shape
-
-        if base_module is not None:
-            return _get_weight_shape(base_module.weight)
-        elif base_weight_param_name is not None:
-            if not base_weight_param_name.endswith(".weight"):
-                raise ValueError(
-                    f"Invalid `base_weight_param_name` passed as it does not end with '.weight' {base_weight_param_name=}."
-                )
-            module_path = base_weight_param_name.rsplit(".weight", 1)[0]
-            submodule = get_submodule_by_name(model, module_path)
-            return _get_weight_shape(submodule.weight)
-
-    raise ValueError("Either `base_module` or `base_weight_param_name` must be provided.")


 # The reason why we subclass from `StableDiffusionLoraLoaderMixin` here is because Amused initially
 # relied on `StableDiffusionLoraLoaderMixin` for its LoRA support.
```
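The helper removed above exists because bitsandbytes 4-bit layers flatten their weight data, so the logical (out_features, in_features) shape has to be read from `weight.quant_state.shape` instead of `weight.shape` (8-bit layers keep the original shape). A self-contained, hedged sketch of that check, using a plain `nn.Linear` stand-in so it runs without bitsandbytes; the `Params4bit`/`quant_state` names are taken from the removed code:

```python
import torch
import torch.nn as nn


def logical_weight_shape(weight: torch.Tensor) -> torch.Size:
    # bitsandbytes `Params4bit` stores flattened data; the original shape lives on `quant_state`.
    if weight.__class__.__name__ == "Params4bit":
        return weight.quant_state.shape
    return weight.shape


linear = nn.Linear(in_features=64, out_features=128)
print(logical_weight_shape(linear.weight))  # torch.Size([128, 64])
```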
```diff
@@ -40,7 +40,7 @@ def load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs)
     force_download = kwargs.pop("force_download", False)
     proxies = kwargs.pop("proxies", None)
     local_files_only = kwargs.pop("local_files_only", None)
-    hf_token = kwargs.pop("hf_token", None)
+    token = kwargs.pop("token", None)
     revision = kwargs.pop("revision", None)
     subfolder = kwargs.pop("subfolder", None)
     weight_name = kwargs.pop("weight_name", None)
@@ -73,7 +73,7 @@ def load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs)
                 force_download=force_download,
                 proxies=proxies,
                 local_files_only=local_files_only,
-                token=hf_token,
+                token=token,
                 revision=revision,
                 subfolder=subfolder,
                 user_agent=user_agent,
@@ -93,7 +93,7 @@ def load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs)
                 force_download=force_download,
                 proxies=proxies,
                 local_files_only=local_files_only,
-                token=hf_token,
+                token=token,
                 revision=revision,
                 subfolder=subfolder,
                 user_agent=user_agent,
@@ -312,7 +312,7 @@ class TextualInversionLoaderMixin:
             local_files_only (`bool`, *optional*, defaults to `False`):
                 Whether to only load local model weights and configuration files or not. If set to `True`, the model
                 won't be downloaded from the Hub.
-            hf_token (`str` or *bool*, *optional*):
+            token (`str` or *bool*, *optional*):
                 The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
                 `diffusers-cli login` (stored in `~/.huggingface`) is used.
             revision (`str`, *optional*, defaults to `"main"`):
```
```diff
@@ -21,7 +21,7 @@ from transformers import T5EncoderModel, T5TokenizerFast

 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...loaders import Mochi1LoraLoaderMixin
-from ...models.autoencoders import AutoencoderKLMochi
+from ...models.autoencoders import AutoencoderKL
 from ...models.transformers import MochiTransformer3DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
 from ...utils import (
@@ -151,8 +151,8 @@ class MochiPipeline(DiffusionPipeline, Mochi1LoraLoaderMixin):
            Conditional Transformer architecture to denoise the encoded video latents.
         scheduler ([`FlowMatchEulerDiscreteScheduler`]):
             A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
-        vae ([`AutoencoderKLMochi`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
         text_encoder ([`T5EncoderModel`]):
             [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
             the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
@@ -171,7 +171,7 @@ class MochiPipeline(DiffusionPipeline, Mochi1LoraLoaderMixin):
     def __init__(
         self,
         scheduler: FlowMatchEulerDiscreteScheduler,
-        vae: AutoencoderKLMochi,
+        vae: AutoencoderKL,
         text_encoder: T5EncoderModel,
         tokenizer: T5TokenizerFast,
         transformer: MochiTransformer3DModel,
```
```diff
@@ -16,7 +16,6 @@ import html
 import inspect
 import re
 import urllib.parse as ul
-import warnings
 from typing import Callable, Dict, List, Optional, Tuple, Union

 import torch
@@ -42,7 +41,6 @@ from ..pixart_alpha.pipeline_pixart_alpha import (
     ASPECT_RATIO_1024_BIN,
 )
 from ..pixart_alpha.pipeline_pixart_sigma import ASPECT_RATIO_2048_BIN
-from ..sana.pipeline_sana import ASPECT_RATIO_4096_BIN
 from .pag_utils import PAGMixin


@@ -641,7 +639,7 @@ class SanaPAGPipeline(DiffusionPipeline, PAGMixin):
         negative_prompt_attention_mask: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        clean_caption: bool = False,
+        clean_caption: bool = True,
         use_resolution_binning: bool = True,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
@@ -757,9 +755,7 @@ class SanaPAGPipeline(DiffusionPipeline, PAGMixin):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

         if use_resolution_binning:
-            if self.transformer.config.sample_size == 128:
-                aspect_ratio_bin = ASPECT_RATIO_4096_BIN
-            elif self.transformer.config.sample_size == 64:
+            if self.transformer.config.sample_size == 64:
                 aspect_ratio_bin = ASPECT_RATIO_2048_BIN
             elif self.transformer.config.sample_size == 32:
                 aspect_ratio_bin = ASPECT_RATIO_1024_BIN
@@ -916,14 +912,7 @@ class SanaPAGPipeline(DiffusionPipeline, PAGMixin):
             image = latents
         else:
             latents = latents.to(self.vae.dtype)
-            try:
-                image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
-            except torch.cuda.OutOfMemoryError as e:
-                warnings.warn(
-                    f"{e}. \n"
-                    f"Try to use VAE tiling for large images. For example: \n"
-                    f"pipe.vae.enable_tiling(tile_sample_min_width=512, tile_sample_min_height=512)"
-                )
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
         if use_resolution_binning:
             image = self.image_processor.resize_and_crop_tensor(image, orig_width, orig_height)
```
```diff
@@ -16,7 +16,6 @@ import html
 import inspect
 import re
 import urllib.parse as ul
-import warnings
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import torch
@@ -954,14 +953,7 @@ class SanaPipeline(DiffusionPipeline, SanaLoraLoaderMixin):
             image = latents
         else:
             latents = latents.to(self.vae.dtype)
-            try:
-                image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
-            except torch.cuda.OutOfMemoryError as e:
-                warnings.warn(
-                    f"{e}. \n"
-                    f"Try to use VAE tiling for large images. For example: \n"
-                    f"pipe.vae.enable_tiling(tile_sample_min_width=512, tile_sample_min_height=512)"
-                )
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
         if use_resolution_binning:
             image = self.image_processor.resize_and_crop_tensor(image, orig_width, orig_height)
```
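Both Sana hunks above wrap `vae.decode` in the same out-of-memory guard: catch `torch.cuda.OutOfMemoryError` and tell the user to switch on VAE tiling. A hedged, standalone sketch of that pattern around an arbitrary decode callable; unlike the hunk, this sketch re-raises after emitting the hint, and `decode_fn` is a placeholder rather than diffusers API:

```python
import warnings

import torch


def decode_with_oom_hint(decode_fn, latents: torch.Tensor) -> torch.Tensor:
    """Run a VAE decode and, on CUDA OOM, point the user at VAE tiling."""
    try:
        return decode_fn(latents)
    except torch.cuda.OutOfMemoryError as e:
        warnings.warn(
            f"{e}. \n"
            "Try to use VAE tiling for large images. For example: \n"
            "pipe.vae.enable_tiling(tile_sample_min_width=512, tile_sample_min_height=512)"
        )
        raise
```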
```diff
@@ -13,21 +13,19 @@
 # limitations under the License.

 import inspect
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Callable, Dict, List, Optional, Union

 import torch
 from transformers import (
-    BaseImageProcessor,
     CLIPTextModelWithProjection,
     CLIPTokenizer,
-    PreTrainedModel,
     T5EncoderModel,
     T5TokenizerFast,
 )

 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FromSingleFileMixin, SD3IPAdapterMixin, SD3LoraLoaderMixin
+from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin
 from ...models.autoencoders import AutoencoderKL
 from ...models.transformers import SD3Transformer2DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
@@ -164,7 +162,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps


-class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin, SD3IPAdapterMixin):
+class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin):
     r"""
     Args:
         transformer ([`SD3Transformer2DModel`]):
@@ -196,14 +194,10 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
         tokenizer_3 (`T5TokenizerFast`):
             Tokenizer of class
             [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
-        image_encoder (`PreTrainedModel`, *optional*):
-            Pre-trained Vision Model for IP Adapter.
-        feature_extractor (`BaseImageProcessor`, *optional*):
-            Image processor for IP Adapter.
     """

-    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->image_encoder->transformer->vae"
-    _optional_components = ["image_encoder", "feature_extractor"]
+    model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae"
+    _optional_components = []
     _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]

     def __init__(
@@ -217,8 +211,6 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
         tokenizer_2: CLIPTokenizer,
         text_encoder_3: T5EncoderModel,
         tokenizer_3: T5TokenizerFast,
-        image_encoder: PreTrainedModel = None,
-        feature_extractor: BaseImageProcessor = None,
     ):
         super().__init__()

@@ -232,8 +224,6 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
             tokenizer_3=tokenizer_3,
             transformer=transformer,
             scheduler=scheduler,
-            image_encoder=image_encoder,
-            feature_extractor=feature_extractor,
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
         latent_channels = self.vae.config.latent_channels if getattr(self, "vae", None) else 16
@@ -828,10 +818,6 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
     def do_classifier_free_guidance(self):
         return self._guidance_scale > 1

-    @property
-    def joint_attention_kwargs(self):
-        return self._joint_attention_kwargs
-
     @property
     def num_timesteps(self):
         return self._num_timesteps
@@ -840,84 +826,6 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
     def interrupt(self):
         return self._interrupt

-    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.encode_image
-    def encode_image(self, image: PipelineImageInput, device: torch.device) -> torch.Tensor:
-        """Encodes the given image into a feature representation using a pre-trained image encoder.
-
-        Args:
-            image (`PipelineImageInput`):
-                Input image to be encoded.
-            device: (`torch.device`):
-                Torch device.
-
-        Returns:
-            `torch.Tensor`: The encoded image feature representation.
-        """
-        if not isinstance(image, torch.Tensor):
-            image = self.feature_extractor(image, return_tensors="pt").pixel_values
-
-        image = image.to(device=device, dtype=self.dtype)
-
-        return self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
-
-    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.prepare_ip_adapter_image_embeds
-    def prepare_ip_adapter_image_embeds(
-        self,
-        ip_adapter_image: Optional[PipelineImageInput] = None,
-        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
-        device: Optional[torch.device] = None,
-        num_images_per_prompt: int = 1,
-        do_classifier_free_guidance: bool = True,
-    ) -> torch.Tensor:
-        """Prepares image embeddings for use in the IP-Adapter.
-
-        Either `ip_adapter_image` or `ip_adapter_image_embeds` must be passed.
-
-        Args:
-            ip_adapter_image (`PipelineImageInput`, *optional*):
-                The input image to extract features from for IP-Adapter.
-            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
-                Precomputed image embeddings.
-            device: (`torch.device`, *optional*):
-                Torch device.
-            num_images_per_prompt (`int`, defaults to 1):
-                Number of images that should be generated per prompt.
-            do_classifier_free_guidance (`bool`, defaults to True):
-                Whether to use classifier free guidance or not.
-        """
-        device = device or self._execution_device
-
-        if ip_adapter_image_embeds is not None:
-            if do_classifier_free_guidance:
-                single_negative_image_embeds, single_image_embeds = ip_adapter_image_embeds.chunk(2)
-            else:
-                single_image_embeds = ip_adapter_image_embeds
-        elif ip_adapter_image is not None:
-            single_image_embeds = self.encode_image(ip_adapter_image, device)
-            if do_classifier_free_guidance:
-                single_negative_image_embeds = torch.zeros_like(single_image_embeds)
-        else:
-            raise ValueError("Neither `ip_adapter_image_embeds` or `ip_adapter_image_embeds` were provided.")
-
-        image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
-
-        if do_classifier_free_guidance:
-            negative_image_embeds = torch.cat([single_negative_image_embeds] * num_images_per_prompt, dim=0)
-            image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
-
-        return image_embeds.to(device=device)
-
-    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.enable_sequential_cpu_offload
-    def enable_sequential_cpu_offload(self, *args, **kwargs):
-        if self.image_encoder is not None and "image_encoder" not in self._exclude_from_cpu_offload:
-            logger.warning(
-                "`pipe.enable_sequential_cpu_offload()` might fail for `image_encoder` if it uses "
-                "`torch.nn.MultiheadAttention`. You can exclude `image_encoder` from CPU offloading by calling "
-                "`pipe._exclude_from_cpu_offload.append('image_encoder')` before `pipe.enable_sequential_cpu_offload()`."
-            )
-
-        super().enable_sequential_cpu_offload(*args, **kwargs)
-
     @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
```
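The removed `prepare_ip_adapter_image_embeds` above is mostly classifier-free-guidance batching: repeat the single image embedding once per requested image and, when CFG is on, prepend a matching all-zeros "negative" embedding. A hedged toy reproduction of just that batching step (the tensor shape below is invented for illustration):

```python
import torch


def batch_image_embeds(
    single_image_embeds: torch.Tensor,
    num_images_per_prompt: int,
    do_classifier_free_guidance: bool,
) -> torch.Tensor:
    # Zeros stand in for the negative image embedding, mirroring the removed method.
    negative = torch.zeros_like(single_image_embeds)
    image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
    if do_classifier_free_guidance:
        negative_embeds = torch.cat([negative] * num_images_per_prompt, dim=0)
        image_embeds = torch.cat([negative_embeds, image_embeds], dim=0)
    return image_embeds


embeds = batch_image_embeds(torch.randn(1, 729, 1152), num_images_per_prompt=2, do_classifier_free_guidance=True)
print(embeds.shape)  # torch.Size([4, 729, 1152])
```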
```diff
@@ -945,11 +853,8 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
         negative_prompt_embeds: Optional[torch.Tensor] = None,
         pooled_prompt_embeds: Optional[torch.Tensor] = None,
         negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
-        ip_adapter_image: Optional[PipelineImageInput] = None,
-        ip_adapter_image_embeds: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
         clip_skip: Optional[int] = None,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
@@ -985,9 +890,9 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
             mask_image_latent (`torch.Tensor`, `List[torch.Tensor]`):
                 `Tensor` representing an image batch to mask `image` generated by VAE. If not provided, the mask
                 latents tensor will ge generated by `mask_image`.
-            height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor):
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image. This is set to 1024 by default for the best results.
-            width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor):
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image. This is set to 1024 by default for the best results.
             padding_mask_crop (`int`, *optional*, defaults to `None`):
                 The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
@@ -1048,22 +953,12 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
                 Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                 input argument.
-            ip_adapter_image (`PipelineImageInput`, *optional*):
-                Optional image input to work with IP Adapters.
-            ip_adapter_image_embeds (`torch.Tensor`, *optional*):
-                Pre-generated image embeddings for IP-Adapter. Should be a tensor of shape `(batch_size, num_images,
-                emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to
-                `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] instead of
                 a plain tuple.
-            joint_attention_kwargs (`dict`, *optional*):
-                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
-                `self.processor` in
-                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             callback_on_step_end (`Callable`, *optional*):
                 A function that calls at the end of each denoising steps during the inference. The function is called
                 with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
@@ -1111,7 +1006,6 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro

         self._guidance_scale = guidance_scale
         self._clip_skip = clip_skip
-        self._joint_attention_kwargs = joint_attention_kwargs
         self._interrupt = False

         # 2. Define call parameters
@@ -1266,22 +1160,7 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
                 f"The transformer {self.transformer.__class__} should have 16 input channels or 33 input channels, not {self.transformer.config.in_channels}."
             )

-        # 7. Prepare image embeddings
-        if (ip_adapter_image is not None and self.is_ip_adapter_active) or ip_adapter_image_embeds is not None:
-            ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
-                ip_adapter_image,
-                ip_adapter_image_embeds,
-                device,
-                batch_size * num_images_per_prompt,
-                self.do_classifier_free_guidance,
-            )
-
-            if self.joint_attention_kwargs is None:
-                self._joint_attention_kwargs = {"ip_adapter_image_embeds": ip_adapter_image_embeds}
-            else:
-                self._joint_attention_kwargs.update(ip_adapter_image_embeds=ip_adapter_image_embeds)
-
-        # 8. Denoising loop
+        # 7. Denoising loop
         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
         self._num_timesteps = len(timesteps)
         with self.progress_bar(total=num_inference_steps) as progress_bar:
@@ -1302,7 +1181,6 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
                     timestep=timestep,
                     encoder_hidden_states=prompt_embeds,
                     pooled_projections=pooled_prompt_embeds,
-                    joint_attention_kwargs=self.joint_attention_kwargs,
                     return_dict=False,
                 )[0]
```
```diff
@@ -101,7 +101,7 @@ from .import_utils import (
     is_xformers_available,
     requires_backends,
 )
-from .loading_utils import get_module_from_name, get_submodule_by_name, load_image, load_video
+from .loading_utils import get_module_from_name, load_image, load_video
 from .logging import get_logger
 from .outputs import BaseOutput
 from .peft_utils import (
```
```diff
@@ -148,15 +148,3 @@ def get_module_from_name(module, tensor_name: str) -> Tuple[Any, str]:
         module = new_module
         tensor_name = splits[-1]
     return module, tensor_name
-
-
-def get_submodule_by_name(root_module, module_path: str):
-    current = root_module
-    parts = module_path.split(".")
-    for part in parts:
-        if part.isdigit():
-            idx = int(part)
-            current = current[idx]  # e.g., for nn.ModuleList or nn.Sequential
-        else:
-            current = getattr(current, part)
-    return current
```
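The removed `get_submodule_by_name` walks a dotted path, indexing containers such as `nn.Sequential`/`nn.ModuleList` when a path segment is numeric and falling back to `getattr` otherwise. A small hedged usage sketch against a toy module; the function body restates the removed helper and the model layout is invented for illustration:

```python
import torch.nn as nn


def get_submodule_by_name(root_module: nn.Module, module_path: str) -> nn.Module:
    current = root_module
    for part in module_path.split("."):
        # Numeric parts index into containers like nn.ModuleList / nn.Sequential.
        current = current[int(part)] if part.isdigit() else getattr(current, part)
    return current


class ToyBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.attn = nn.Linear(8, 8)


class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.blocks = nn.ModuleList([ToyBlock(), ToyBlock()])


model = ToyModel()
print(get_submodule_by_name(model, "blocks.1.attn"))  # Linear(in_features=8, out_features=8, bias=True)
```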
```diff
@@ -33,7 +33,6 @@ class CogVideoXTransformerTests(ModelTesterMixin, unittest.TestCase):
     model_class = CogVideoXTransformer3DModel
     main_input_name = "hidden_states"
     uses_custom_attn_processor = True
-    model_split_percents = [0.7, 0.7, 0.8]

     @property
     def dummy_input(self):
```
```diff
@@ -33,7 +33,6 @@ class CogView3PlusTransformerTests(ModelTesterMixin, unittest.TestCase):
     model_class = CogView3PlusTransformer2DModel
     main_input_name = "hidden_states"
     uses_custom_attn_processor = True
-    model_split_percents = [0.7, 0.6, 0.6]

     @property
     def dummy_input(self):
```
```diff
@@ -106,8 +106,6 @@ class StableDiffusion3InpaintPipelineFastTests(PipelineLatentTesterMixin, unitte
             "tokenizer_3": tokenizer_3,
             "transformer": transformer,
             "vae": vae,
-            "image_encoder": None,
-            "feature_extractor": None,
         }

     def get_dummy_inputs(self, device, seed=0):
```
```diff
@@ -20,7 +20,6 @@ import unittest
 import numpy as np
 import pytest
 import safetensors.torch
-from huggingface_hub import hf_hub_download

 from diffusers import BitsAndBytesConfig, DiffusionPipeline, FluxTransformer2DModel, SD3Transformer2DModel
 from diffusers.utils import is_accelerate_version, logging
@@ -569,27 +568,6 @@ class SlowBnb4BitFluxTests(Base4bitTests):
         max_diff = numpy_cosine_similarity_distance(expected_slice, out_slice)
         self.assertTrue(max_diff < 1e-3)

-    def test_lora_loading(self):
-        self.pipeline_4bit.load_lora_weights(
-            hf_hub_download("ByteDance/Hyper-SD", "Hyper-FLUX.1-dev-8steps-lora.safetensors"), adapter_name="hyper-sd"
-        )
-        self.pipeline_4bit.set_adapters("hyper-sd", adapter_weights=0.125)
-
-        output = self.pipeline_4bit(
-            prompt=self.prompt,
-            height=256,
-            width=256,
-            max_sequence_length=64,
-            output_type="np",
-            num_inference_steps=8,
-            generator=torch.Generator().manual_seed(42),
-        ).images
-        out_slice = output[0, -3:, -3:, -1].flatten()
-        expected_slice = np.array([0.5347, 0.5342, 0.5283, 0.5093, 0.4988, 0.5093, 0.5044, 0.5015, 0.4946])
-
-        max_diff = numpy_cosine_similarity_distance(expected_slice, out_slice)
-        self.assertTrue(max_diff < 1e-3)
-

 @slow
 class BaseBnb4BitSerializationTests(Base4bitTests):
```
```diff
@@ -18,7 +18,6 @@ import unittest

 import numpy as np
 import pytest
-from huggingface_hub import hf_hub_download

 from diffusers import BitsAndBytesConfig, DiffusionPipeline, FluxTransformer2DModel, SD3Transformer2DModel, logging
 from diffusers.utils import is_accelerate_version
@@ -31,7 +30,6 @@ from diffusers.utils.testing_utils import (
     numpy_cosine_similarity_distance,
     require_accelerate,
     require_bitsandbytes_version_greater,
-    require_peft_version_greater,
     require_torch,
     require_torch_gpu,
     require_transformers_version_greater,
@@ -511,29 +509,6 @@ class SlowBnb8bitFluxTests(Base8bitTests):
         max_diff = numpy_cosine_similarity_distance(expected_slice, out_slice)
         self.assertTrue(max_diff < 1e-3)

-    @require_peft_version_greater("0.14.0")
-    def test_lora_loading(self):
-        self.pipeline_8bit.load_lora_weights(
-            hf_hub_download("ByteDance/Hyper-SD", "Hyper-FLUX.1-dev-8steps-lora.safetensors"), adapter_name="hyper-sd"
-        )
-        self.pipeline_8bit.set_adapters("hyper-sd", adapter_weights=0.125)
-
-        output = self.pipeline_8bit(
-            prompt=self.prompt,
-            height=256,
-            width=256,
-            max_sequence_length=64,
-            output_type="np",
-            num_inference_steps=8,
-            generator=torch.manual_seed(42),
-        ).images
-        out_slice = output[0, -3:, -3:, -1].flatten()
-
-        expected_slice = np.array([0.3916, 0.3916, 0.3887, 0.4243, 0.4155, 0.4233, 0.4570, 0.4531, 0.4248])
-
-        max_diff = numpy_cosine_similarity_distance(expected_slice, out_slice)
-        self.assertTrue(max_diff < 1e-3)
-

 @slow
 class BaseBnb8bitSerializationTests(Base8bitTests):
```