mirror of
https://github.com/huggingface/diffusers.git
synced 2026-03-05 00:00:50 +08:00
Compare commits
2 Commits
not-trust-
...
remove-non
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5e87c38b29 | ||
|
|
a80b19218b |
@@ -5472,6 +5472,10 @@ class Flux2LoraLoaderMixin(LoraBaseMixin):
|
||||
logger.warning(warn_msg)
|
||||
state_dict = {k: v for k, v in state_dict.items() if "dora_scale" not in k}
|
||||
|
||||
is_peft_format = any(k.startswith("base_model.model.") for k in state_dict)
|
||||
if is_peft_format:
|
||||
state_dict = {k.replace("base_model.model.", "diffusion_model."): v for k, v in state_dict.items()}
|
||||
|
||||
is_ai_toolkit = any(k.startswith("diffusion_model.") for k in state_dict)
|
||||
if is_ai_toolkit:
|
||||
state_dict = _convert_non_diffusers_flux2_lora_to_diffusers(state_dict)
|
||||
|
||||
@@ -34,12 +34,7 @@ from ..utils import (
|
||||
get_logger,
|
||||
is_aiter_available,
|
||||
is_aiter_version,
|
||||
is_flash_attn_3_available,
|
||||
is_flash_attn_available,
|
||||
is_flash_attn_version,
|
||||
is_kernels_available,
|
||||
is_sageattention_available,
|
||||
is_sageattention_version,
|
||||
is_torch_npu_available,
|
||||
is_torch_version,
|
||||
is_torch_xla_available,
|
||||
@@ -55,62 +50,23 @@ from ._modeling_parallel import gather_size_by_comm
|
||||
if TYPE_CHECKING:
|
||||
from ._modeling_parallel import ParallelConfig
|
||||
|
||||
_REQUIRED_FLASH_VERSION = "2.6.3"
|
||||
_REQUIRED_AITER_VERSION = "0.1.5"
|
||||
_REQUIRED_SAGE_VERSION = "2.1.1"
|
||||
_REQUIRED_FLEX_VERSION = "2.5.0"
|
||||
_REQUIRED_XLA_VERSION = "2.2"
|
||||
_REQUIRED_XFORMERS_VERSION = "0.0.29"
|
||||
|
||||
_CAN_USE_FLASH_ATTN = is_flash_attn_available() and is_flash_attn_version(">=", _REQUIRED_FLASH_VERSION)
|
||||
_CAN_USE_FLASH_ATTN_3 = is_flash_attn_3_available()
|
||||
_CAN_USE_AITER_ATTN = is_aiter_available() and is_aiter_version(">=", _REQUIRED_AITER_VERSION)
|
||||
_CAN_USE_SAGE_ATTN = is_sageattention_available() and is_sageattention_version(">=", _REQUIRED_SAGE_VERSION)
|
||||
_CAN_USE_FLEX_ATTN = is_torch_version(">=", _REQUIRED_FLEX_VERSION)
|
||||
_CAN_USE_NPU_ATTN = is_torch_npu_available()
|
||||
_CAN_USE_XLA_ATTN = is_torch_xla_available() and is_torch_xla_version(">=", _REQUIRED_XLA_VERSION)
|
||||
_CAN_USE_XFORMERS_ATTN = is_xformers_available() and is_xformers_version(">=", _REQUIRED_XFORMERS_VERSION)
|
||||
|
||||
|
||||
if _CAN_USE_FLASH_ATTN:
|
||||
from flash_attn import flash_attn_func, flash_attn_varlen_func
|
||||
from flash_attn.flash_attn_interface import _wrapped_flash_attn_backward, _wrapped_flash_attn_forward
|
||||
else:
|
||||
flash_attn_func = None
|
||||
flash_attn_varlen_func = None
|
||||
_wrapped_flash_attn_backward = None
|
||||
_wrapped_flash_attn_forward = None
|
||||
|
||||
|
||||
if _CAN_USE_FLASH_ATTN_3:
|
||||
from flash_attn_interface import flash_attn_func as flash_attn_3_func
|
||||
from flash_attn_interface import flash_attn_varlen_func as flash_attn_3_varlen_func
|
||||
else:
|
||||
flash_attn_3_func = None
|
||||
flash_attn_3_varlen_func = None
|
||||
|
||||
if _CAN_USE_AITER_ATTN:
|
||||
from aiter import flash_attn_func as aiter_flash_attn_func
|
||||
else:
|
||||
aiter_flash_attn_func = None
|
||||
|
||||
if _CAN_USE_SAGE_ATTN:
|
||||
from sageattention import (
|
||||
sageattn,
|
||||
sageattn_qk_int8_pv_fp8_cuda,
|
||||
sageattn_qk_int8_pv_fp8_cuda_sm90,
|
||||
sageattn_qk_int8_pv_fp16_cuda,
|
||||
sageattn_qk_int8_pv_fp16_triton,
|
||||
sageattn_varlen,
|
||||
)
|
||||
else:
|
||||
sageattn = None
|
||||
sageattn_qk_int8_pv_fp16_cuda = None
|
||||
sageattn_qk_int8_pv_fp16_triton = None
|
||||
sageattn_qk_int8_pv_fp8_cuda = None
|
||||
sageattn_qk_int8_pv_fp8_cuda_sm90 = None
|
||||
sageattn_varlen = None
|
||||
|
||||
|
||||
if _CAN_USE_FLEX_ATTN:
|
||||
# We cannot import the flex_attention function from the package directly because it is expected (from the
|
||||
@@ -136,27 +92,6 @@ if _CAN_USE_XFORMERS_ATTN:
|
||||
else:
|
||||
xops = None
|
||||
|
||||
# Feature guard for PyTorch compatibility: `torch.library.custom_op` and
# `torch.library.register_fake` were added in PyTorch 2.4.
#
# The APIs are detected by attribute rather than by comparing `torch.__version__` as a string:
# a lexicographic comparison misorders versions such as "2.10.0" (which sorts *before* "2.4.0")
# and would silently swap the real registration functions for the no-op fallbacks below.
if hasattr(getattr(torch, "library", None), "custom_op") and hasattr(torch.library, "register_fake"):
    _custom_op = torch.library.custom_op
    _register_fake = torch.library.register_fake
else:

    def custom_op_no_op(name, fn=None, /, *, mutates_args, device_types=None, schema=None):
        """No-op stand-in for `torch.library.custom_op`.

        Returns `fn` unchanged when it is given directly, otherwise returns a decorator that
        hands the decorated function back untouched. All other arguments are ignored.
        """

        def wrap(func):
            return func

        return wrap if fn is None else fn

    def register_fake_no_op(op, fn=None, /, *, lib=None, _stacklevel=1):
        """No-op stand-in for `torch.library.register_fake`.

        Returns `fn` unchanged when it is given directly, otherwise returns a decorator that
        hands the decorated function back untouched. All other arguments are ignored.
        """

        def wrap(func):
            return func

        return wrap if fn is None else fn

    _custom_op = custom_op_no_op
    _register_fake = register_fake_no_op
|
||||
|
||||
|
||||
logger = get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
@@ -304,11 +239,11 @@ def attention_backend(backend: str | AttentionBackendName = AttentionBackendName
|
||||
"""
|
||||
Context manager to set the active attention backend.
|
||||
"""
|
||||
if backend not in _AttentionBackendRegistry._backends:
|
||||
raise ValueError(f"Backend {backend} is not registered.")
|
||||
|
||||
backend = AttentionBackendName(backend)
|
||||
_check_attention_backend_requirements(backend)
|
||||
|
||||
if backend not in _AttentionBackendRegistry._backends:
|
||||
raise ValueError(f"Backend {backend} is not registered.")
|
||||
_maybe_download_kernel_for_backend(backend)
|
||||
|
||||
old_backend = _AttentionBackendRegistry._active_backend
|
||||
@@ -442,16 +377,32 @@ def _check_shape(
|
||||
|
||||
def _check_attention_backend_requirements(backend: AttentionBackendName) -> None:
|
||||
if backend in [AttentionBackendName.FLASH, AttentionBackendName.FLASH_VARLEN]:
|
||||
if not _CAN_USE_FLASH_ATTN:
|
||||
raise RuntimeError(
|
||||
f"Flash Attention backend '{backend.value}' is not usable because of missing package or the version is too old. Please install `flash-attn>={_REQUIRED_FLASH_VERSION}`."
|
||||
)
|
||||
raise RuntimeError(
|
||||
f"The '{backend.value}' attention backend has been removed. "
|
||||
f"Please use 'flash_hub' or 'flash_varlen_hub' instead, which load the flash-attn kernel from the Hub. "
|
||||
f"Install the required package with `pip install kernels`."
|
||||
)
|
||||
|
||||
elif backend in [AttentionBackendName._FLASH_3, AttentionBackendName._FLASH_VARLEN_3]:
|
||||
if not _CAN_USE_FLASH_ATTN_3:
|
||||
raise RuntimeError(
|
||||
f"Flash Attention 3 backend '{backend.value}' is not usable because of missing package or the version is too old. Please build FA3 beta release from source."
|
||||
)
|
||||
raise RuntimeError(
|
||||
f"The '{backend.value}' attention backend has been removed. "
|
||||
f"Please use '_flash_3_hub' or '_flash_3_varlen_hub' instead, which load the flash-attn-3 kernel from the Hub. "
|
||||
f"Install the required package with `pip install kernels`."
|
||||
)
|
||||
|
||||
elif backend in [
|
||||
AttentionBackendName.SAGE,
|
||||
AttentionBackendName.SAGE_VARLEN,
|
||||
AttentionBackendName._SAGE_QK_INT8_PV_FP8_CUDA,
|
||||
AttentionBackendName._SAGE_QK_INT8_PV_FP8_CUDA_SM90,
|
||||
AttentionBackendName._SAGE_QK_INT8_PV_FP16_CUDA,
|
||||
AttentionBackendName._SAGE_QK_INT8_PV_FP16_TRITON,
|
||||
]:
|
||||
raise RuntimeError(
|
||||
f"The '{backend.value}' attention backend has been removed. "
|
||||
f"Please use 'sage_hub' instead, which loads the SageAttention kernel from the Hub. "
|
||||
f"Install the required package with `pip install kernels`."
|
||||
)
|
||||
|
||||
elif backend in [
|
||||
AttentionBackendName.FLASH_HUB,
|
||||
@@ -471,19 +422,6 @@ def _check_attention_backend_requirements(backend: AttentionBackendName) -> None
|
||||
f"Aiter Attention backend '{backend.value}' is not usable because of missing package or the version is too old. Please install `aiter>={_REQUIRED_AITER_VERSION}`."
|
||||
)
|
||||
|
||||
elif backend in [
|
||||
AttentionBackendName.SAGE,
|
||||
AttentionBackendName.SAGE_VARLEN,
|
||||
AttentionBackendName._SAGE_QK_INT8_PV_FP8_CUDA,
|
||||
AttentionBackendName._SAGE_QK_INT8_PV_FP8_CUDA_SM90,
|
||||
AttentionBackendName._SAGE_QK_INT8_PV_FP16_CUDA,
|
||||
AttentionBackendName._SAGE_QK_INT8_PV_FP16_TRITON,
|
||||
]:
|
||||
if not _CAN_USE_SAGE_ATTN:
|
||||
raise RuntimeError(
|
||||
f"Sage Attention backend '{backend.value}' is not usable because of missing package or the version is too old. Please install `sageattention>={_REQUIRED_SAGE_VERSION}`."
|
||||
)
|
||||
|
||||
elif backend == AttentionBackendName.FLEX:
|
||||
if not _CAN_USE_FLEX_ATTN:
|
||||
raise RuntimeError(
|
||||
@@ -652,78 +590,6 @@ def _maybe_download_kernel_for_backend(backend: AttentionBackendName) -> None:
|
||||
raise
|
||||
|
||||
|
||||
# ===== torch op registrations =====
|
||||
# Registrations are required for fullgraph tracing compatibility
|
||||
# TODO: this is only required because the beta release FA3 does not have it. There is a PR adding
|
||||
# this but it was never merged: https://github.com/Dao-AILab/flash-attention/pull/1590
|
||||
@_custom_op("_diffusers_flash_attn_3::_flash_attn_forward", mutates_args=(), device_types="cuda")
def _wrapped_flash_attn_3(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    softmax_scale: float | None = None,
    causal: bool = False,
    qv: torch.Tensor | None = None,
    q_descale: torch.Tensor | None = None,
    k_descale: torch.Tensor | None = None,
    v_descale: torch.Tensor | None = None,
    attention_chunk: int = 0,
    softcap: float = 0.0,
    num_splits: int = 1,
    pack_gqa: bool | None = None,
    deterministic: bool = False,
    sm_margin: int = 0,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Custom-op wrapper around `flash_attn_3_func`.

    Forwards every argument to `flash_attn_3_func` and returns the first two results: the
    attention output and the log-sum-exp tensor with its last two dimensions swapped.
    """
    # The window size is fixed here because pytorch does not support tuple/int type hints
    # in the op signature.
    attn_out, softmax_lse, *_ = flash_attn_3_func(
        q=q,
        k=k,
        v=v,
        softmax_scale=softmax_scale,
        causal=causal,
        qv=qv,
        q_descale=q_descale,
        k_descale=k_descale,
        v_descale=v_descale,
        window_size=(-1, -1),
        attention_chunk=attention_chunk,
        softcap=softcap,
        num_splits=num_splits,
        pack_gqa=pack_gqa,
        deterministic=deterministic,
        sm_margin=sm_margin,
    )
    return attn_out, softmax_lse.permute(0, 2, 1)
|
||||
|
||||
|
||||
@_register_fake("_diffusers_flash_attn_3::_flash_attn_forward")
def _(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    softmax_scale: float | None = None,
    causal: bool = False,
    qv: torch.Tensor | None = None,
    q_descale: torch.Tensor | None = None,
    k_descale: torch.Tensor | None = None,
    v_descale: torch.Tensor | None = None,
    attention_chunk: int = 0,
    softcap: float = 0.0,
    num_splits: int = 1,
    pack_gqa: bool | None = None,
    deterministic: bool = False,
    sm_margin: int = 0,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Fake implementation of `_diffusers_flash_attn_3::_flash_attn_forward`.

    Only shapes are propagated: the output mirrors `q`, and the second tensor is built from the
    first three dimensions of `q`. Most parameters are not yet used anywhere in diffusers, so
    they are deliberately ignored to keep the shape propagation simple.
    """
    # `q` must be 4-dimensional; the last dimension is not needed for the LSE shape.
    dim0, dim1, dim2, _unused_last_dim = q.shape
    return torch.empty_like(q), q.new_empty((dim0, dim1, dim2))
|
||||
|
||||
|
||||
# ===== Helper functions to use attention backends with templated CP autograd functions =====
|
||||
|
||||
|
||||
@@ -995,107 +861,6 @@ def _native_flash_attention_backward_op(
|
||||
return grad_query, grad_key, grad_value
|
||||
|
||||
|
||||
# Adapted from: https://github.com/Dao-AILab/flash-attention/blob/fd2fc9d85c8e54e5c20436465bca709bc1a6c5a1/flash_attn/flash_attn_interface.py#L807
def _flash_attention_forward_op(
    ctx: torch.autograd.function.FunctionCtx,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: torch.Tensor | None = None,
    dropout_p: float = 0.0,
    is_causal: bool = False,
    scale: float | None = None,
    enable_gqa: bool = False,
    return_lse: bool = False,
    _save_ctx: bool = True,
    _parallel_config: "ParallelConfig" | None = None,
):
    """Forward op for flash-attn 2, used with the templated context-parallel attention.

    Calls `_wrapped_flash_attn_forward` and, when `_save_ctx` is set, stores on `ctx` the tensors
    and settings that `_flash_attention_backward_op` reads back.

    Returns `(out, lse)` when `return_lse` is truthy, otherwise just `out`.

    Raises:
        ValueError: if `attn_mask` is given or `enable_gqa` is set.
    """
    if attn_mask is not None:
        raise ValueError("`attn_mask` is not yet supported for flash-attn 2.")
    if enable_gqa:
        raise ValueError("`enable_gqa` is not yet supported for flash-attn 2.")

    # Settings that are fixed for now.
    window_size = (-1, -1)
    softcap = 0.0
    alibi_slopes = None
    deterministic = False

    grad_enabled = any(tensor.requires_grad for tensor in (query, key, value))

    if scale is None:
        scale = query.shape[-1] ** (-0.5)

    # flash-attn only returns LSE if dropout_p > 0, so a negligible dropout is used as a workaround
    # whenever gradients are needed or context parallelism spans more than one rank.
    lse_required = grad_enabled or (
        _parallel_config is not None and _parallel_config.context_parallel_config._world_size > 1
    )
    if lse_required and not dropout_p > 0:
        dropout_p = 1e-30

    with torch.set_grad_enabled(grad_enabled):
        out, lse, _s_dmask, rng_state = _wrapped_flash_attn_forward(
            query,
            key,
            value,
            dropout_p,
            scale,
            is_causal,
            window_size[0],
            window_size[1],
            softcap,
            alibi_slopes,
            return_lse,
        )
        lse = lse.permute(0, 2, 1)

    if _save_ctx:
        ctx.save_for_backward(query, key, value, out, lse, rng_state)
        ctx.dropout_p = dropout_p
        ctx.scale = scale
        ctx.is_causal = is_causal
        ctx.window_size = window_size
        ctx.softcap = softcap
        ctx.alibi_slopes = alibi_slopes
        ctx.deterministic = deterministic

    if return_lse:
        return (out, lse)
    return out
|
||||
|
||||
|
||||
def _flash_attention_backward_op(
    ctx: torch.autograd.function.FunctionCtx,
    grad_out: torch.Tensor,
    *args,
    **kwargs,
):
    """Backward op for flash-attn 2, paired with `_flash_attention_forward_op`.

    Reads the tensors and settings saved on `ctx`, lets `_wrapped_flash_attn_backward` fill
    freshly allocated gradient buffers, and returns the gradients for query, key and value.
    Extra positional and keyword arguments are accepted and ignored.
    """
    query, key, value, out, lse, rng_state = ctx.saved_tensors

    # Buffers that the backward kernel writes into.
    grad_query = torch.empty_like(query)
    grad_key = torch.empty_like(key)
    grad_value = torch.empty_like(value)

    _wrapped_flash_attn_backward(
        grad_out,
        query,
        key,
        value,
        out,
        lse,
        grad_query,
        grad_key,
        grad_value,
        ctx.dropout_p,
        ctx.scale,
        ctx.is_causal,
        ctx.window_size[0],
        ctx.window_size[1],
        ctx.softcap,
        ctx.alibi_slopes,
        ctx.deterministic,
        rng_state,
    )

    # Head dimension may have been padded, so trim every gradient to the width of `grad_out`.
    head_dim = grad_out.shape[-1]
    return grad_query[..., :head_dim], grad_key[..., :head_dim], grad_value[..., :head_dim]
|
||||
|
||||
|
||||
def _flash_attention_hub_forward_op(
|
||||
ctx: torch.autograd.function.FunctionCtx,
|
||||
query: torch.Tensor,
|
||||
@@ -1327,44 +1092,6 @@ def _flash_attention_3_hub_backward_op(
|
||||
return grad_query, grad_key, grad_value
|
||||
|
||||
|
||||
def _sage_attention_forward_op(
    ctx: torch.autograd.function.FunctionCtx,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: torch.Tensor | None = None,
    dropout_p: float = 0.0,
    is_causal: bool = False,
    scale: float | None = None,
    enable_gqa: bool = False,
    return_lse: bool = False,
    _save_ctx: bool = True,
    _parallel_config: "ParallelConfig" | None = None,
):
    """Forward op for Sage attention, used with the templated context-parallel attention.

    Calls `sageattn` with the "NHD" tensor layout. Nothing is stored on `ctx`.

    Returns `(out, lse)` with the last two dimensions of `lse` swapped when `return_lse` is
    truthy, otherwise just the result of `sageattn`.

    Raises:
        ValueError: if `attn_mask` is given, `dropout_p` is positive, or `enable_gqa` is set.
    """
    if attn_mask is not None:
        raise ValueError("`attn_mask` is not yet supported for Sage attention.")
    if dropout_p > 0.0:
        raise ValueError("`dropout_p` is not yet supported for Sage attention.")
    if enable_gqa:
        raise ValueError("`enable_gqa` is not yet supported for Sage attention.")

    result = sageattn(
        q=query,
        k=key,
        v=value,
        tensor_layout="NHD",
        is_causal=is_causal,
        sm_scale=scale,
        return_lse=return_lse,
    )
    if not return_lse:
        return result

    out, lse, *_ = result
    return (out, lse.permute(0, 2, 1))
|
||||
|
||||
|
||||
def _sage_attention_hub_forward_op(
|
||||
ctx: torch.autograd.function.FunctionCtx,
|
||||
query: torch.Tensor,
|
||||
@@ -2205,59 +1932,6 @@ def _templated_context_parallel_attention(
|
||||
# ===== Attention backends =====
|
||||
|
||||
|
||||
@_AttentionBackendRegistry.register(
    AttentionBackendName.FLASH,
    constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape],
    supports_context_parallel=True,
)
def _flash_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: torch.Tensor | None = None,
    dropout_p: float = 0.0,
    is_causal: bool = False,
    scale: float | None = None,
    return_lse: bool = False,
    _parallel_config: "ParallelConfig" | None = None,
) -> torch.Tensor:
    """Attention backend registered as `AttentionBackendName.FLASH`.

    Without a parallel config the call goes straight to `flash_attn_func`; with one it is routed
    through `_templated_context_parallel_attention` using the flash-attn forward/backward ops.

    Returns `(out, lse)` when `return_lse` is truthy, otherwise just the attention output.

    Raises:
        ValueError: if `attn_mask` is given.
    """
    if attn_mask is not None:
        raise ValueError("`attn_mask` is not supported for flash-attn 2.")

    if _parallel_config is None:
        result = flash_attn_func(
            q=query,
            k=key,
            v=value,
            dropout_p=dropout_p,
            softmax_scale=scale,
            causal=is_causal,
            return_attn_probs=return_lse,
        )
        if return_lse:
            # Keep only the first two entries of what flash-attn returns.
            out, lse, *_ = result
            return (out, lse)
        return result

    result = _templated_context_parallel_attention(
        query,
        key,
        value,
        None,
        dropout_p,
        is_causal,
        scale,
        False,
        return_lse,
        forward_op=_flash_attention_forward_op,
        backward_op=_flash_attention_backward_op,
        _parallel_config=_parallel_config,
    )
    if return_lse:
        out, lse = result
        return (out, lse)
    return result
|
||||
|
||||
|
||||
@_AttentionBackendRegistry.register(
|
||||
AttentionBackendName.FLASH_HUB,
|
||||
constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape],
|
||||
@@ -2369,88 +2043,6 @@ def _flash_varlen_attention_hub(
|
||||
return out
|
||||
|
||||
|
||||
@_AttentionBackendRegistry.register(
    AttentionBackendName.FLASH_VARLEN,
    constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape],
)
def _flash_varlen_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: torch.Tensor | None = None,
    dropout_p: float = 0.0,
    scale: float | None = None,
    is_causal: bool = False,
    return_lse: bool = False,
    _parallel_config: "ParallelConfig" | None = None,
) -> torch.Tensor:
    """Attention backend registered as `AttentionBackendName.FLASH_VARLEN`.

    Packs the batch into the variable-length layout expected by `flash_attn_varlen_func`: queries
    are flattened over the first two dimensions, while keys and values are truncated per sample to
    the lengths reported by `_prepare_for_flash_attn_or_sage_varlen` and concatenated.

    Returns the attention output with its first dimension unflattened back to `(batch_size, -1)`.
    When `return_lse` is truthy, returns `(out, lse)` instead, where `lse` is the second value
    produced by `flash_attn_varlen_func`, passed through unchanged.
    NOTE(review): `lse` is presumably still in the packed layout — confirm what callers expect.

    `_parallel_config` is accepted but not used by this backend.
    """
    batch_size, seq_len_q, _, _ = query.shape
    _, seq_len_kv, _, _ = key.shape

    if attn_mask is not None:
        attn_mask = _normalize_attn_mask(attn_mask, batch_size, seq_len_kv)

    (_, seqlens_k), (cu_seqlens_q, cu_seqlens_k), (max_seqlen_q, max_seqlen_k) = (
        _prepare_for_flash_attn_or_sage_varlen(
            batch_size, seq_len_q, seq_len_kv, attn_mask=attn_mask, device=query.device
        )
    )

    # Keep only the valid key/value positions of every sample before packing.
    key_packed = torch.cat([key[b, : seqlens_k[b]] for b in range(batch_size)], dim=0)
    value_packed = torch.cat([value[b, : seqlens_k[b]] for b in range(batch_size)], dim=0)
    query_packed = query.flatten(0, 1)

    out = flash_attn_varlen_func(
        q=query_packed,
        k=key_packed,
        v=value_packed,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seqlen_q=max_seqlen_q,
        max_seqlen_k=max_seqlen_k,
        dropout_p=dropout_p,
        softmax_scale=scale,
        causal=is_causal,
        return_attn_probs=return_lse,
    )

    # With `return_attn_probs=True` flash-attn returns a tuple rather than a single tensor, so it
    # has to be unpacked before the output can be reshaped (previously this path crashed).
    lse = None
    if return_lse:
        out, lse, *_ = out
    out = out.unflatten(0, (batch_size, -1))

    return (out, lse) if return_lse else out
|
||||
|
||||
|
||||
@_AttentionBackendRegistry.register(
    AttentionBackendName._FLASH_3,
    constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape],
)
def _flash_attention_3(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: torch.Tensor | None = None,
    scale: float | None = None,
    is_causal: bool = False,
    return_lse: bool = False,
    _parallel_config: "ParallelConfig" | None = None,
) -> torch.Tensor:
    """Attention backend registered as `AttentionBackendName._FLASH_3`.

    Delegates to `_wrapped_flash_attn_3`. Returns `(out, lse)` when `return_lse` is truthy,
    otherwise just the attention output. `_parallel_config` is accepted but not used.

    Raises:
        ValueError: if `attn_mask` is given.
    """
    if attn_mask is not None:
        raise ValueError("`attn_mask` is not supported for flash-attn 3.")

    out, lse = _wrapped_flash_attn_3(q=query, k=key, v=value, softmax_scale=scale, causal=is_causal)
    if return_lse:
        return (out, lse)
    return out
|
||||
|
||||
|
||||
@_AttentionBackendRegistry.register(
|
||||
AttentionBackendName._FLASH_3_HUB,
|
||||
constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape],
|
||||
@@ -2587,58 +2179,6 @@ def _flash_attention_3_varlen_hub(
|
||||
return (out, lse) if return_lse else out
|
||||
|
||||
|
||||
@_AttentionBackendRegistry.register(
    AttentionBackendName._FLASH_VARLEN_3,
    constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape],
)
def _flash_varlen_attention_3(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: torch.Tensor | None = None,
    scale: float | None = None,
    is_causal: bool = False,
    return_lse: bool = False,
    _parallel_config: "ParallelConfig" | None = None,
) -> torch.Tensor:
    """Attention backend registered as `AttentionBackendName._FLASH_VARLEN_3`.

    Packs queries, keys and values into the variable-length layout and calls
    `flash_attn_3_varlen_func`. Keys and values are truncated per sample to the lengths reported
    by `_prepare_for_flash_attn_or_sage_varlen` before being concatenated.

    Returns `(out, lse)` when `return_lse` is truthy, otherwise just the attention output, whose
    first dimension is unflattened back to `(batch_size, -1)`. `_parallel_config` is accepted but
    not used.
    """
    batch_size, seq_len_q = query.shape[0], query.shape[1]
    # Both tensors must be 4-dimensional, exactly as the tuple unpacking below requires.
    _, _, _, _ = query.shape
    _, seq_len_kv, _, _ = key.shape

    if attn_mask is not None:
        attn_mask = _normalize_attn_mask(attn_mask, batch_size, seq_len_kv)

    seqlen_info = _prepare_for_flash_attn_or_sage_varlen(
        batch_size, seq_len_q, seq_len_kv, attn_mask=attn_mask, device=query.device
    )
    (_, seqlens_k), (cu_seqlens_q, cu_seqlens_k), (max_seqlen_q, max_seqlen_k) = seqlen_info

    kept_keys = []
    kept_values = []
    for sample_idx in range(batch_size):
        n_valid = seqlens_k[sample_idx]
        kept_keys.append(key[sample_idx, :n_valid])
        kept_values.append(value[sample_idx, :n_valid])

    packed_q = query.flatten(0, 1)
    packed_k = torch.cat(kept_keys, dim=0)
    packed_v = torch.cat(kept_values, dim=0)

    out, lse, *_ = flash_attn_3_varlen_func(
        q=packed_q,
        k=packed_k,
        v=packed_v,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seqlen_q=max_seqlen_q,
        max_seqlen_k=max_seqlen_k,
        softmax_scale=scale,
        causal=is_causal,
    )
    out = out.unflatten(0, (batch_size, -1))

    if return_lse:
        return (out, lse)
    return out
|
||||
|
||||
|
||||
@_AttentionBackendRegistry.register(
|
||||
AttentionBackendName.AITER,
|
||||
constraints=[_check_device_cuda, _check_qkv_dtype_bf16_or_fp16, _check_shape],
|
||||
@@ -3108,57 +2648,6 @@ def _native_xla_attention(
|
||||
return out
|
||||
|
||||
|
||||
@_AttentionBackendRegistry.register(
    AttentionBackendName.SAGE,
    constraints=[_check_device_cuda, _check_qkv_dtype_bf16_or_fp16, _check_shape],
    supports_context_parallel=True,
)
def _sage_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: torch.Tensor | None = None,
    is_causal: bool = False,
    scale: float | None = None,
    return_lse: bool = False,
    _parallel_config: "ParallelConfig" | None = None,
) -> torch.Tensor:
    """Attention backend registered as `AttentionBackendName.SAGE`.

    Without a parallel config the call goes straight to `sageattn` with the "NHD" layout; with
    one it is routed through `_templated_context_parallel_attention` using the Sage forward and
    backward ops (with dropout fixed at 0.0).

    Returns `(out, lse)` when `return_lse` is truthy, otherwise just the attention output.

    Raises:
        ValueError: if `attn_mask` is given.
    """
    if attn_mask is not None:
        raise ValueError("`attn_mask` is not supported for sage attention")

    if _parallel_config is None:
        result = sageattn(
            q=query,
            k=key,
            v=value,
            tensor_layout="NHD",
            is_causal=is_causal,
            sm_scale=scale,
            return_lse=return_lse,
        )
        if return_lse:
            # Keep only the first two entries of what `sageattn` returns.
            out, lse, *_ = result
            return (out, lse)
        return result

    result = _templated_context_parallel_attention(
        query,
        key,
        value,
        None,
        0.0,
        is_causal,
        scale,
        False,
        return_lse,
        forward_op=_sage_attention_forward_op,
        backward_op=_sage_attention_backward_op,
        _parallel_config=_parallel_config,
    )
    if return_lse:
        out, lse = result
        return (out, lse)
    return result
|
||||
|
||||
|
||||
@_AttentionBackendRegistry.register(
|
||||
AttentionBackendName.SAGE_HUB,
|
||||
constraints=[_check_device_cuda, _check_qkv_dtype_bf16_or_fp16, _check_shape],
|
||||
@@ -3211,169 +2700,6 @@ def _sage_attention_hub(
|
||||
return (out, lse) if return_lse else out
|
||||
|
||||
|
||||
@_AttentionBackendRegistry.register(
    AttentionBackendName.SAGE_VARLEN,
    constraints=[_check_device_cuda, _check_qkv_dtype_bf16_or_fp16, _check_shape],
)
def _sage_varlen_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: torch.Tensor | None = None,
    is_causal: bool = False,
    scale: float | None = None,
    return_lse: bool = False,
    _parallel_config: "ParallelConfig" | None = None,
) -> torch.Tensor:
    """Attention backend registered as `AttentionBackendName.SAGE_VARLEN`.

    Packs queries, keys and values into the variable-length layout and calls `sageattn_varlen`.
    Keys and values are truncated per sample to the lengths reported by
    `_prepare_for_flash_attn_or_sage_varlen` before being concatenated.

    Returns the attention output with its first dimension unflattened back to `(batch_size, -1)`.
    `_parallel_config` is accepted but not used.

    Raises:
        ValueError: if `return_lse` is truthy.
    """
    if return_lse:
        raise ValueError("Sage varlen backend does not support setting `return_lse=True`.")

    # Both tensors must be 4-dimensional.
    batch_size, seq_len_q, _, _ = query.shape
    _, seq_len_kv, _, _ = key.shape

    if attn_mask is not None:
        attn_mask = _normalize_attn_mask(attn_mask, batch_size, seq_len_kv)

    seqlen_info = _prepare_for_flash_attn_or_sage_varlen(
        batch_size, seq_len_q, seq_len_kv, attn_mask=attn_mask, device=query.device
    )
    (_, seqlens_k), (cu_seqlens_q, cu_seqlens_k), (max_seqlen_q, max_seqlen_k) = seqlen_info

    kept_keys = []
    kept_values = []
    for sample_idx in range(batch_size):
        n_valid = seqlens_k[sample_idx]
        kept_keys.append(key[sample_idx, :n_valid])
        kept_values.append(value[sample_idx, :n_valid])

    packed_q = query.flatten(0, 1)
    packed_k = torch.cat(kept_keys, dim=0)
    packed_v = torch.cat(kept_values, dim=0)

    packed_out = sageattn_varlen(
        q=packed_q,
        k=packed_k,
        v=packed_v,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seqlen_q=max_seqlen_q,
        max_seqlen_k=max_seqlen_k,
        is_causal=is_causal,
        sm_scale=scale,
    )
    return packed_out.unflatten(0, (batch_size, -1))
|
||||
|
||||
|
||||
@_AttentionBackendRegistry.register(
    AttentionBackendName._SAGE_QK_INT8_PV_FP8_CUDA,
    constraints=[_check_device_cuda_atleast_smXY(9, 0), _check_shape],
)
def _sage_qk_int8_pv_fp8_cuda_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: torch.Tensor | None = None,
    is_causal: bool = False,
    scale: float | None = None,
    return_lse: bool = False,
    _parallel_config: "ParallelConfig" | None = None,
) -> torch.Tensor:
    """Attention backend registered as `AttentionBackendName._SAGE_QK_INT8_PV_FP8_CUDA`.

    Thin wrapper that forwards to `sageattn_qk_int8_pv_fp8_cuda` with the "NHD" tensor layout and
    returns its result as-is. `_parallel_config` is accepted but not used.

    Raises:
        ValueError: if `attn_mask` is given.
    """
    if attn_mask is not None:
        raise ValueError("`attn_mask` is not supported for sage attention")

    kernel_kwargs = {
        "q": query,
        "k": key,
        "v": value,
        "tensor_layout": "NHD",
        "is_causal": is_causal,
        "sm_scale": scale,
        "return_lse": return_lse,
    }
    return sageattn_qk_int8_pv_fp8_cuda(**kernel_kwargs)
|
||||
|
||||
|
||||
@_AttentionBackendRegistry.register(
    AttentionBackendName._SAGE_QK_INT8_PV_FP8_CUDA_SM90,
    constraints=[_check_device_cuda_atleast_smXY(9, 0), _check_shape],
)
def _sage_qk_int8_pv_fp8_cuda_sm90_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: torch.Tensor | None = None,
    is_causal: bool = False,
    scale: float | None = None,
    return_lse: bool = False,
    _parallel_config: "ParallelConfig" | None = None,
) -> torch.Tensor:
    """Attention backend registered as `AttentionBackendName._SAGE_QK_INT8_PV_FP8_CUDA_SM90`.

    Thin wrapper that forwards to `sageattn_qk_int8_pv_fp8_cuda_sm90` with the "NHD" tensor layout
    and returns its result as-is. `_parallel_config` is accepted but not used.

    Raises:
        ValueError: if `attn_mask` is given.
    """
    if attn_mask is not None:
        raise ValueError("`attn_mask` is not supported for sage attention")

    kernel_kwargs = {
        "q": query,
        "k": key,
        "v": value,
        "tensor_layout": "NHD",
        "is_causal": is_causal,
        "sm_scale": scale,
        "return_lse": return_lse,
    }
    return sageattn_qk_int8_pv_fp8_cuda_sm90(**kernel_kwargs)
|
||||
|
||||
|
||||
@_AttentionBackendRegistry.register(
    AttentionBackendName._SAGE_QK_INT8_PV_FP16_CUDA,
    constraints=[_check_device_cuda_atleast_smXY(8, 0), _check_shape],
)
def _sage_qk_int8_pv_fp16_cuda_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: torch.Tensor | None = None,
    is_causal: bool = False,
    scale: float | None = None,
    return_lse: bool = False,
    _parallel_config: "ParallelConfig" | None = None,
) -> torch.Tensor:
    """Attention backend registered as `AttentionBackendName._SAGE_QK_INT8_PV_FP16_CUDA`.

    Thin wrapper that forwards to `sageattn_qk_int8_pv_fp16_cuda` with the "NHD" tensor layout and
    returns its result as-is. `_parallel_config` is accepted but not used.

    Raises:
        ValueError: if `attn_mask` is given.
    """
    if attn_mask is not None:
        raise ValueError("`attn_mask` is not supported for sage attention")

    kernel_kwargs = {
        "q": query,
        "k": key,
        "v": value,
        "tensor_layout": "NHD",
        "is_causal": is_causal,
        "sm_scale": scale,
        "return_lse": return_lse,
    }
    return sageattn_qk_int8_pv_fp16_cuda(**kernel_kwargs)
|
||||
|
||||
|
||||
@_AttentionBackendRegistry.register(
    AttentionBackendName._SAGE_QK_INT8_PV_FP16_TRITON,
    constraints=[_check_device_cuda_atleast_smXY(8, 0), _check_shape],
)
def _sage_qk_int8_pv_fp16_triton_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: torch.Tensor | None = None,
    is_causal: bool = False,
    scale: float | None = None,
    return_lse: bool = False,
    _parallel_config: "ParallelConfig" | None = None,
) -> torch.Tensor:
    """Attention backend registered as `AttentionBackendName._SAGE_QK_INT8_PV_FP16_TRITON`.

    Thin wrapper that forwards to `sageattn_qk_int8_pv_fp16_triton` with the "NHD" tensor layout
    and returns its result as-is. `_parallel_config` is accepted but not used.

    Raises:
        ValueError: if `attn_mask` is given.
    """
    if attn_mask is not None:
        raise ValueError("`attn_mask` is not supported for sage attention")

    kernel_kwargs = {
        "q": query,
        "k": key,
        "v": value,
        "tensor_layout": "NHD",
        "is_causal": is_causal,
        "sm_scale": scale,
        "return_lse": return_lse,
    }
    return sageattn_qk_int8_pv_fp16_triton(**kernel_kwargs)
|
||||
|
||||
|
||||
@_AttentionBackendRegistry.register(
|
||||
AttentionBackendName.XFORMERS,
|
||||
constraints=[_check_attn_mask_or_causal, _check_device, _check_shape],
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
import importlib
|
||||
import inspect
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
import warnings
|
||||
from collections import OrderedDict
|
||||
@@ -29,16 +28,10 @@ from tqdm.auto import tqdm
|
||||
from typing_extensions import Self
|
||||
|
||||
from ..configuration_utils import ConfigMixin, FrozenDict
|
||||
from ..pipelines.pipeline_loading_utils import (
|
||||
LOADABLE_CLASSES,
|
||||
_fetch_class_library_tuple,
|
||||
_unwrap_model,
|
||||
simple_get_class_obj,
|
||||
)
|
||||
from ..pipelines.pipeline_loading_utils import _fetch_class_library_tuple, simple_get_class_obj
|
||||
from ..utils import PushToHubMixin, is_accelerate_available, logging
|
||||
from ..utils.dynamic_modules_utils import get_class_from_dynamic_module, resolve_trust_remote_code
|
||||
from ..utils.hub_utils import load_or_create_model_card, populate_model_card
|
||||
from ..utils.torch_utils import is_compiled_module
|
||||
from .components_manager import ComponentsManager
|
||||
from .modular_pipeline_utils import (
|
||||
MODULAR_MODEL_CARD_TEMPLATE,
|
||||
@@ -1640,14 +1633,7 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
|
||||
blocks_class_name = self.default_blocks_name
|
||||
if blocks_class_name is not None:
|
||||
diffusers_module = importlib.import_module("diffusers")
|
||||
blocks_class = getattr(diffusers_module, blocks_class_name, None)
|
||||
# If the blocks_class is not found or is a base class (e.g. SequentialPipelineBlocks saved by from_blocks_dict) with empty block_classes
|
||||
# fall back to default_blocks_name
|
||||
if blocks_class is None or not blocks_class.block_classes:
|
||||
blocks_class_name = self.default_blocks_name
|
||||
blocks_class = getattr(diffusers_module, blocks_class_name)
|
||||
|
||||
if blocks_class is not None:
|
||||
blocks_class = getattr(diffusers_module, blocks_class_name)
|
||||
blocks = blocks_class()
|
||||
else:
|
||||
logger.warning(f"`blocks` is `None`, no default blocks class found for {self.__class__.__name__}")
|
||||
@@ -1833,124 +1819,29 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
|
||||
)
|
||||
return pipeline
|
||||
|
||||
def save_pretrained(
|
||||
self,
|
||||
save_directory: str | os.PathLike,
|
||||
safe_serialization: bool = True,
|
||||
variant: str | None = None,
|
||||
max_shard_size: int | str | None = None,
|
||||
push_to_hub: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
def save_pretrained(self, save_directory: str | os.PathLike, push_to_hub: bool = False, **kwargs):
|
||||
"""
|
||||
Save the pipeline and all its components to a directory, so that it can be re-loaded using the
|
||||
[`~ModularPipeline.from_pretrained`] class method.
|
||||
Save the pipeline to a directory. It does not save components, you need to save them separately.
|
||||
|
||||
Args:
|
||||
save_directory (`str` or `os.PathLike`):
|
||||
Directory to save the pipeline to. Will be created if it doesn't exist.
|
||||
safe_serialization (`bool`, *optional*, defaults to `True`):
|
||||
Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`.
|
||||
variant (`str`, *optional*):
|
||||
If specified, weights are saved in the format `pytorch_model.<variant>.bin`.
|
||||
max_shard_size (`int` or `str`, defaults to `None`):
|
||||
The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
|
||||
lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5GB"`).
|
||||
If expressed as an integer, the unit is bytes.
|
||||
push_to_hub (`bool`, *optional*, defaults to `False`):
|
||||
Whether to push the pipeline to the Hugging Face model hub after saving it.
|
||||
**kwargs: Additional keyword arguments:
|
||||
- `overwrite_modular_index` (`bool`, *optional*, defaults to `False`):
|
||||
When saving a Modular Pipeline, its components in `modular_model_index.json` may reference repos
|
||||
different from the destination repo. Setting this to `True` updates all component references in
|
||||
`modular_model_index.json` so they point to the repo specified by `repo_id`.
|
||||
- `repo_id` (`str`, *optional*):
|
||||
The repository ID to push the pipeline to. Defaults to the last component of `save_directory`.
|
||||
- `commit_message` (`str`, *optional*):
|
||||
Commit message for the push to hub operation.
|
||||
- `private` (`bool`, *optional*):
|
||||
Whether the repository should be private.
|
||||
- `create_pr` (`bool`, *optional*, defaults to `False`):
|
||||
Whether to create a pull request instead of pushing directly.
|
||||
- `token` (`str`, *optional*):
|
||||
The Hugging Face token to use for authentication.
|
||||
Path to the directory where the pipeline will be saved.
|
||||
push_to_hub (`bool`, optional):
|
||||
Whether to push the pipeline to the huggingface hub.
|
||||
**kwargs: Additional arguments passed to `save_config()` method
|
||||
"""
|
||||
overwrite_modular_index = kwargs.pop("overwrite_modular_index", False)
|
||||
repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
|
||||
|
||||
if push_to_hub:
|
||||
commit_message = kwargs.pop("commit_message", None)
|
||||
private = kwargs.pop("private", None)
|
||||
create_pr = kwargs.pop("create_pr", False)
|
||||
token = kwargs.pop("token", None)
|
||||
repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
|
||||
repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id
|
||||
|
||||
for component_name, component_spec in self._component_specs.items():
|
||||
if component_spec.default_creation_method != "from_pretrained":
|
||||
continue
|
||||
|
||||
component = getattr(self, component_name, None)
|
||||
if component is None:
|
||||
continue
|
||||
|
||||
model_cls = component.__class__
|
||||
if is_compiled_module(component):
|
||||
component = _unwrap_model(component)
|
||||
model_cls = component.__class__
|
||||
|
||||
save_method_name = None
|
||||
for library_name, library_classes in LOADABLE_CLASSES.items():
|
||||
if library_name in sys.modules:
|
||||
library = importlib.import_module(library_name)
|
||||
else:
|
||||
logger.info(
|
||||
f"{library_name} is not installed. Cannot save {component_name} as {library_classes} from {library_name}"
|
||||
)
|
||||
continue
|
||||
|
||||
for base_class, save_load_methods in library_classes.items():
|
||||
class_candidate = getattr(library, base_class, None)
|
||||
if class_candidate is not None and issubclass(model_cls, class_candidate):
|
||||
save_method_name = save_load_methods[0]
|
||||
break
|
||||
if save_method_name is not None:
|
||||
break
|
||||
|
||||
if save_method_name is None:
|
||||
logger.warning(f"self.{component_name}={component} of type {type(component)} cannot be saved.")
|
||||
continue
|
||||
|
||||
save_method = getattr(component, save_method_name)
|
||||
save_method_signature = inspect.signature(save_method)
|
||||
save_method_accept_safe = "safe_serialization" in save_method_signature.parameters
|
||||
save_method_accept_variant = "variant" in save_method_signature.parameters
|
||||
save_method_accept_max_shard_size = "max_shard_size" in save_method_signature.parameters
|
||||
|
||||
save_kwargs = {}
|
||||
if save_method_accept_safe:
|
||||
save_kwargs["safe_serialization"] = safe_serialization
|
||||
if save_method_accept_variant:
|
||||
save_kwargs["variant"] = variant
|
||||
if save_method_accept_max_shard_size and max_shard_size is not None:
|
||||
save_kwargs["max_shard_size"] = max_shard_size
|
||||
|
||||
component_save_path = os.path.join(save_directory, component_name)
|
||||
save_method(component_save_path, **save_kwargs)
|
||||
|
||||
if component_name not in self.config:
|
||||
continue
|
||||
|
||||
has_no_load_id = not hasattr(component, "_diffusers_load_id") or component._diffusers_load_id == "null"
|
||||
if overwrite_modular_index or has_no_load_id:
|
||||
library, class_name, component_spec_dict = self.config[component_name]
|
||||
component_spec_dict["pretrained_model_name_or_path"] = repo_id if push_to_hub else save_directory
|
||||
component_spec_dict["subfolder"] = component_name
|
||||
self.register_to_config(**{component_name: (library, class_name, component_spec_dict)})
|
||||
|
||||
self.save_config(save_directory=save_directory)
|
||||
|
||||
if push_to_hub:
|
||||
# Generate modular pipeline card content
|
||||
card_content = generate_modular_model_card_content(self.blocks)
|
||||
|
||||
# Create a new empty model card and eventually tag it
|
||||
model_card = load_or_create_model_card(
|
||||
repo_id,
|
||||
token=token,
|
||||
@@ -1959,8 +1850,13 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
|
||||
is_modular=True,
|
||||
)
|
||||
model_card = populate_model_card(model_card, tags=card_content["tags"])
|
||||
|
||||
model_card.save(os.path.join(save_directory, "README.md"))
|
||||
|
||||
# YiYi TODO: maybe order the json file to make it more readable: configs first, then components
|
||||
self.save_config(save_directory=save_directory)
|
||||
|
||||
if push_to_hub:
|
||||
self._upload_folder(
|
||||
save_directory,
|
||||
repo_id,
|
||||
@@ -2228,9 +2124,8 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
|
||||
```
|
||||
|
||||
Notes:
|
||||
- Components loaded with `AutoModel.from_pretrained()` or `ComponentSpec.load()` will have
|
||||
loading specs preserved for serialization. Custom or locally loaded components without Hub references will
|
||||
have their `modular_model_index.json` entries updated automatically during `save_pretrained()`.
|
||||
- Components with trained weights should be loaded with `AutoModel.from_pretrained()` or
|
||||
`ComponentSpec.load()` so that loading specs are preserved for serialization.
|
||||
- ConfigMixin objects without weights (e.g., schedulers, guiders) can be passed directly.
|
||||
"""
|
||||
|
||||
@@ -2255,6 +2150,10 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
|
||||
elif current_component_spec.default_creation_method == "from_pretrained" and not (
|
||||
hasattr(component, "_diffusers_load_id") and component._diffusers_load_id is not None
|
||||
):
|
||||
logger.warning(
|
||||
f"ModularPipeline.update_components: {name} has no valid _diffusers_load_id. "
|
||||
f"This will result in empty loading spec, use ComponentSpec.load() for proper specs"
|
||||
)
|
||||
new_component_spec = ComponentSpec(name=name, type_hint=type(component))
|
||||
else:
|
||||
new_component_spec = ComponentSpec.from_component(name, component)
|
||||
|
||||
@@ -311,12 +311,6 @@ class ComponentSpec:
|
||||
f"`type_hint` is required when loading a single file model but is missing for component: {self.name}"
|
||||
)
|
||||
|
||||
# `torch_dtype` is not an accepted parameter for tokenizers and processors.
|
||||
# As a result, it gets stored in `init_kwargs`, which are written to the config
|
||||
# during save. This causes JSON serialization to fail when saving the component.
|
||||
if self.type_hint is not None and not issubclass(self.type_hint, torch.nn.Module):
|
||||
kwargs.pop("torch_dtype", None)
|
||||
|
||||
if self.type_hint is None:
|
||||
try:
|
||||
from diffusers import AutoModel
|
||||
@@ -334,12 +328,6 @@ class ComponentSpec:
|
||||
else getattr(self.type_hint, "from_pretrained")
|
||||
)
|
||||
|
||||
# `torch_dtype` is not an accepted parameter for tokenizers and processors.
|
||||
# As a result, it gets stored in `init_kwargs`, which are written to the config
|
||||
# during save. This causes JSON serialization to fail when saving the component.
|
||||
if not issubclass(self.type_hint, torch.nn.Module):
|
||||
kwargs.pop("torch_dtype", None)
|
||||
|
||||
try:
|
||||
component = load_method(pretrained_model_name_or_path, **load_kwargs, **kwargs)
|
||||
except Exception as e:
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import gc
|
||||
import os
|
||||
import tempfile
|
||||
from typing import Callable
|
||||
|
||||
@@ -687,18 +686,6 @@ class TestLoadComponentsSkipBehavior:
|
||||
assert pipe.unet is not None
|
||||
assert getattr(pipe, "vae", None) is None
|
||||
|
||||
def test_load_components_selective_loading_incremental(self):
|
||||
"""Loading a subset of components should not affect already-loaded components."""
|
||||
pipe = ModularPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-xl-pipe")
|
||||
|
||||
pipe.load_components(names="unet", torch_dtype=torch.float32)
|
||||
pipe.load_components(names="text_encoder", torch_dtype=torch.float32)
|
||||
|
||||
assert hasattr(pipe, "unet")
|
||||
assert pipe.unet is not None
|
||||
assert hasattr(pipe, "text_encoder")
|
||||
assert pipe.text_encoder is not None
|
||||
|
||||
def test_load_components_skips_invalid_pretrained_path(self):
|
||||
pipe = ModularPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-xl-pipe")
|
||||
|
||||
@@ -712,133 +699,3 @@ class TestLoadComponentsSkipBehavior:
|
||||
|
||||
# Verify test_component was not loaded
|
||||
assert not hasattr(pipe, "test_component") or pipe.test_component is None
|
||||
|
||||
|
||||
class TestCustomModelSavePretrained:
|
||||
def test_save_pretrained_updates_index_for_local_model(self, tmp_path):
|
||||
"""When a component without _diffusers_load_id (custom/local model) is saved,
|
||||
modular_model_index.json should point to the save directory."""
|
||||
import json
|
||||
|
||||
pipe = ModularPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-xl-pipe")
|
||||
pipe.load_components(torch_dtype=torch.float32)
|
||||
|
||||
pipe.unet._diffusers_load_id = "null"
|
||||
|
||||
save_dir = str(tmp_path / "my-pipeline")
|
||||
pipe.save_pretrained(save_dir)
|
||||
|
||||
with open(os.path.join(save_dir, "modular_model_index.json")) as f:
|
||||
index = json.load(f)
|
||||
|
||||
_library, _cls, unet_spec = index["unet"]
|
||||
assert unet_spec["pretrained_model_name_or_path"] == save_dir
|
||||
assert unet_spec["subfolder"] == "unet"
|
||||
|
||||
_library, _cls, vae_spec = index["vae"]
|
||||
assert vae_spec["pretrained_model_name_or_path"] == "hf-internal-testing/tiny-stable-diffusion-xl-pipe"
|
||||
|
||||
def test_save_pretrained_roundtrip_with_local_model(self, tmp_path):
|
||||
"""A pipeline with a custom/local model should be saveable and re-loadable with identical outputs."""
|
||||
pipe = ModularPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-xl-pipe")
|
||||
pipe.load_components(torch_dtype=torch.float32)
|
||||
|
||||
pipe.unet._diffusers_load_id = "null"
|
||||
|
||||
original_state_dict = pipe.unet.state_dict()
|
||||
|
||||
save_dir = str(tmp_path / "my-pipeline")
|
||||
pipe.save_pretrained(save_dir)
|
||||
|
||||
loaded_pipe = ModularPipeline.from_pretrained(save_dir)
|
||||
loaded_pipe.load_components(torch_dtype=torch.float32)
|
||||
|
||||
assert loaded_pipe.unet is not None
|
||||
assert loaded_pipe.unet.__class__.__name__ == pipe.unet.__class__.__name__
|
||||
|
||||
loaded_state_dict = loaded_pipe.unet.state_dict()
|
||||
assert set(original_state_dict.keys()) == set(loaded_state_dict.keys())
|
||||
for key in original_state_dict:
|
||||
assert torch.equal(original_state_dict[key], loaded_state_dict[key]), f"Mismatch in {key}"
|
||||
|
||||
def test_save_pretrained_updates_index_for_model_with_no_load_id(self, tmp_path):
|
||||
"""When a component without _diffusers_load_id (custom/local model) is saved,
|
||||
modular_model_index.json should point to the save directory."""
|
||||
import json
|
||||
|
||||
from diffusers import UNet2DConditionModel
|
||||
|
||||
pipe = ModularPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-xl-pipe")
|
||||
pipe.load_components(torch_dtype=torch.float32)
|
||||
|
||||
unet = UNet2DConditionModel.from_pretrained(
|
||||
"hf-internal-testing/tiny-stable-diffusion-xl-pipe", subfolder="unet"
|
||||
)
|
||||
assert not hasattr(unet, "_diffusers_load_id")
|
||||
|
||||
pipe.update_components(unet=unet)
|
||||
|
||||
save_dir = str(tmp_path / "my-pipeline")
|
||||
pipe.save_pretrained(save_dir)
|
||||
|
||||
with open(os.path.join(save_dir, "modular_model_index.json")) as f:
|
||||
index = json.load(f)
|
||||
|
||||
_library, _cls, unet_spec = index["unet"]
|
||||
assert unet_spec["pretrained_model_name_or_path"] == save_dir
|
||||
assert unet_spec["subfolder"] == "unet"
|
||||
|
||||
_library, _cls, vae_spec = index["vae"]
|
||||
assert vae_spec["pretrained_model_name_or_path"] == "hf-internal-testing/tiny-stable-diffusion-xl-pipe"
|
||||
|
||||
def test_save_pretrained_overwrite_modular_index(self, tmp_path):
|
||||
"""With overwrite_modular_index=True, all component references should point to the save directory."""
|
||||
import json
|
||||
|
||||
pipe = ModularPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-xl-pipe")
|
||||
pipe.load_components(torch_dtype=torch.float32)
|
||||
|
||||
save_dir = str(tmp_path / "my-pipeline")
|
||||
pipe.save_pretrained(save_dir, overwrite_modular_index=True)
|
||||
|
||||
with open(os.path.join(save_dir, "modular_model_index.json")) as f:
|
||||
index = json.load(f)
|
||||
|
||||
for component_name in ["unet", "vae", "text_encoder", "text_encoder_2"]:
|
||||
if component_name not in index:
|
||||
continue
|
||||
_library, _cls, spec = index[component_name]
|
||||
assert spec["pretrained_model_name_or_path"] == save_dir, (
|
||||
f"{component_name} should point to save dir but got {spec['pretrained_model_name_or_path']}"
|
||||
)
|
||||
assert spec["subfolder"] == component_name
|
||||
|
||||
loaded_pipe = ModularPipeline.from_pretrained(save_dir)
|
||||
loaded_pipe.load_components(torch_dtype=torch.float32)
|
||||
|
||||
assert loaded_pipe.unet is not None
|
||||
assert loaded_pipe.vae is not None
|
||||
|
||||
|
||||
class TestModularPipelineInitFallback:
|
||||
"""Test that ModularPipeline.__init__ falls back to default_blocks_name when
|
||||
_blocks_class_name is a base class (e.g. SequentialPipelineBlocks saved by from_blocks_dict)."""
|
||||
|
||||
def test_init_fallback_when_blocks_class_name_is_base_class(self, tmp_path):
|
||||
# 1. Load pipeline and get a workflow (returns a base SequentialPipelineBlocks)
|
||||
pipe = ModularPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-xl-pipe")
|
||||
t2i_blocks = pipe.blocks.get_workflow("text2image")
|
||||
assert t2i_blocks.__class__.__name__ == "SequentialPipelineBlocks"
|
||||
|
||||
# 2. Use init_pipeline to create a new pipeline from the workflow blocks
|
||||
t2i_pipe = t2i_blocks.init_pipeline("hf-internal-testing/tiny-stable-diffusion-xl-pipe")
|
||||
|
||||
# 3. Save and reload — the saved config will have _blocks_class_name="SequentialPipelineBlocks"
|
||||
save_dir = str(tmp_path / "pipeline")
|
||||
t2i_pipe.save_pretrained(save_dir)
|
||||
loaded_pipe = ModularPipeline.from_pretrained(save_dir)
|
||||
|
||||
# 4. Verify it fell back to default_blocks_name and has correct blocks
|
||||
assert loaded_pipe.__class__.__name__ == pipe.__class__.__name__
|
||||
assert loaded_pipe._blocks.__class__.__name__ == pipe._blocks.__class__.__name__
|
||||
assert len(loaded_pipe._blocks.sub_blocks) == len(pipe._blocks.sub_blocks)
|
||||
|
||||
Reference in New Issue
Block a user