Compare commits

..

2 Commits

Author SHA1 Message Date
DN6
36c0d78b8b fix copies 2026-02-16 13:10:19 +05:30
DN6
66f6f8b926 remove k-diffusion 2026-02-16 12:50:25 +05:30
27 changed files with 720 additions and 531 deletions

View File

@@ -199,6 +199,11 @@ jobs:
- name: Install dependencies
run: |
# Install pkgs which depend on setuptools<81 for pkg_resources first with no build isolation
uv pip install pip==25.2 setuptools==80.10.2
uv pip install --no-build-isolation k-diffusion==0.0.12
uv pip install --upgrade pip setuptools
# Install the rest as normal
uv pip install -e ".[quality]"
uv pip install peft@git+https://github.com/huggingface/peft.git
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git

View File

@@ -126,6 +126,11 @@ jobs:
- name: Install dependencies
run: |
# Install pkgs which depend on setuptools<81 for pkg_resources first with no build isolation
uv pip install pip==25.2 setuptools==80.10.2
uv pip install --no-build-isolation k-diffusion==0.0.12
uv pip install --upgrade pip setuptools
# Install the rest as normal
uv pip install -e ".[quality]"
uv pip install peft@git+https://github.com/huggingface/peft.git
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git

View File

@@ -41,7 +41,7 @@ jobs:
shell: arch -arch arm64 bash {0}
run: |
${CONDA_RUN} python -m pip install --upgrade pip uv
${CONDA_RUN} python -m uv pip install -e ".[quality]"
${CONDA_RUN} python -m uv pip install -e ".[quality,test]"
${CONDA_RUN} python -m uv pip install torch torchvision torchaudio
${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
${CONDA_RUN} python -m uv pip install transformers --upgrade

View File

@@ -29,7 +29,7 @@ Qwen-Image comes in the following variants:
| Qwen-Image-Edit Plus | [Qwen/Qwen-Image-Edit-2509](https://huggingface.co/Qwen/Qwen-Image-Edit-2509) |
> [!TIP]
> See the [Caching](../../optimization/cache) guide to speed up inference by storing and reusing intermediate outputs.
> [Caching](../../optimization/cache) may also speed up inference by storing and reusing intermediate outputs.
## LoRA for faster inference
@@ -190,12 +190,6 @@ For detailed benchmark scripts and results, see [this gist](https://gist.github.
- all
- __call__
## QwenImageLayeredPipeline
[[autodoc]] QwenImageLayeredPipeline
- all
- __call__
## QwenImagePipelineOutput
[[autodoc]] pipelines.qwenimage.pipeline_output.QwenImagePipelineOutput

View File

@@ -101,7 +101,6 @@ _deps = [
"datasets",
"filelock",
"flax>=0.4.1",
"ftfy",
"hf-doc-builder>=0.3.0",
"httpx<1.0.0",
"huggingface-hub>=0.34.0,<2.0",
@@ -222,14 +221,12 @@ extras["docs"] = deps_list("hf-doc-builder")
extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2", "peft", "timm")
extras["test"] = deps_list(
"compel",
"ftfy",
"GitPython",
"datasets",
"Jinja2",
"invisible-watermark",
"librosa",
"parameterized",
"protobuf",
"pytest",
"pytest-timeout",
"pytest-xdist",
@@ -238,7 +235,6 @@ extras["test"] = deps_list(
"sentencepiece",
"scipy",
"tiktoken",
"torchsde",
"torchvision",
"transformers",
"phonemizer",

View File

@@ -8,7 +8,6 @@ deps = {
"datasets": "datasets",
"filelock": "filelock",
"flax": "flax>=0.4.1",
"ftfy": "ftfy",
"hf-doc-builder": "hf-doc-builder>=0.3.0",
"httpx": "httpx<1.0.0",
"huggingface-hub": "huggingface-hub>=0.34.0,<2.0",

View File

@@ -5472,10 +5472,6 @@ class Flux2LoraLoaderMixin(LoraBaseMixin):
logger.warning(warn_msg)
state_dict = {k: v for k, v in state_dict.items() if "dora_scale" not in k}
is_peft_format = any(k.startswith("base_model.model.") for k in state_dict)
if is_peft_format:
state_dict = {k.replace("base_model.model.", "diffusion_model."): v for k, v in state_dict.items()}
is_ai_toolkit = any(k.startswith("diffusion_model.") for k in state_dict)
if is_ai_toolkit:
state_dict = _convert_non_diffusers_flux2_lora_to_diffusers(state_dict)

File diff suppressed because it is too large Load Diff

View File

@@ -424,7 +424,7 @@ class Flux2SingleTransformerBlock(nn.Module):
self,
hidden_states: torch.Tensor,
encoder_hidden_states: torch.Tensor | None,
temb_mod: torch.Tensor,
temb_mod_params: tuple[torch.Tensor, torch.Tensor, torch.Tensor],
image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
joint_attention_kwargs: dict[str, Any] | None = None,
split_hidden_states: bool = False,
@@ -436,7 +436,7 @@ class Flux2SingleTransformerBlock(nn.Module):
text_seq_len = encoder_hidden_states.shape[1]
hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
mod_shift, mod_scale, mod_gate = Flux2Modulation.split(temb_mod, 1)[0]
mod_shift, mod_scale, mod_gate = temb_mod_params
norm_hidden_states = self.norm(hidden_states)
norm_hidden_states = (1 + mod_scale) * norm_hidden_states + mod_shift
@@ -498,18 +498,16 @@ class Flux2TransformerBlock(nn.Module):
self,
hidden_states: torch.Tensor,
encoder_hidden_states: torch.Tensor,
temb_mod_img: torch.Tensor,
temb_mod_txt: torch.Tensor,
temb_mod_params_img: tuple[tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...],
temb_mod_params_txt: tuple[tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...],
image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
joint_attention_kwargs: dict[str, Any] | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
joint_attention_kwargs = joint_attention_kwargs or {}
# Modulation parameters shape: [1, 1, self.dim]
(shift_msa, scale_msa, gate_msa), (shift_mlp, scale_mlp, gate_mlp) = Flux2Modulation.split(temb_mod_img, 2)
(c_shift_msa, c_scale_msa, c_gate_msa), (c_shift_mlp, c_scale_mlp, c_gate_mlp) = Flux2Modulation.split(
temb_mod_txt, 2
)
(shift_msa, scale_msa, gate_msa), (shift_mlp, scale_mlp, gate_mlp) = temb_mod_params_img
(c_shift_msa, c_scale_msa, c_gate_msa), (c_shift_mlp, c_scale_mlp, c_gate_mlp) = temb_mod_params_txt
# Img stream
norm_hidden_states = self.norm1(hidden_states)
@@ -629,19 +627,15 @@ class Flux2Modulation(nn.Module):
self.linear = nn.Linear(dim, dim * 3 * self.mod_param_sets, bias=bias)
self.act_fn = nn.SiLU()
def forward(self, temb: torch.Tensor) -> torch.Tensor:
def forward(self, temb: torch.Tensor) -> tuple[tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...]:
mod = self.act_fn(temb)
mod = self.linear(mod)
return mod
@staticmethod
# split inside the transformer blocks, to avoid passing tuples into checkpoints https://github.com/huggingface/diffusers/issues/12776
def split(mod: torch.Tensor, mod_param_sets: int) -> tuple[tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...]:
if mod.ndim == 2:
mod = mod.unsqueeze(1)
mod_params = torch.chunk(mod, 3 * mod_param_sets, dim=-1)
mod_params = torch.chunk(mod, 3 * self.mod_param_sets, dim=-1)
# Return tuple of 3-tuples of modulation params shift/scale/gate
return tuple(mod_params[3 * i : 3 * (i + 1)] for i in range(mod_param_sets))
return tuple(mod_params[3 * i : 3 * (i + 1)] for i in range(self.mod_param_sets))
class Flux2Transformer2DModel(
@@ -830,7 +824,7 @@ class Flux2Transformer2DModel(
double_stream_mod_img = self.double_stream_modulation_img(temb)
double_stream_mod_txt = self.double_stream_modulation_txt(temb)
single_stream_mod = self.single_stream_modulation(temb)
single_stream_mod = self.single_stream_modulation(temb)[0]
# 2. Input projection for image (hidden_states) and conditioning text (encoder_hidden_states)
hidden_states = self.x_embedder(hidden_states)
@@ -867,8 +861,8 @@ class Flux2Transformer2DModel(
encoder_hidden_states, hidden_states = block(
hidden_states=hidden_states,
encoder_hidden_states=encoder_hidden_states,
temb_mod_img=double_stream_mod_img,
temb_mod_txt=double_stream_mod_txt,
temb_mod_params_img=double_stream_mod_img,
temb_mod_params_txt=double_stream_mod_txt,
image_rotary_emb=concat_rotary_emb,
joint_attention_kwargs=joint_attention_kwargs,
)
@@ -890,7 +884,7 @@ class Flux2Transformer2DModel(
hidden_states = block(
hidden_states=hidden_states,
encoder_hidden_states=None,
temb_mod=single_stream_mod,
temb_mod_params=single_stream_mod,
image_rotary_emb=concat_rotary_emb,
joint_attention_kwargs=joint_attention_kwargs,
)

View File

@@ -164,11 +164,7 @@ def compute_text_seq_len_from_mask(
position_ids = torch.arange(text_seq_len, device=encoder_hidden_states.device, dtype=torch.long)
active_positions = torch.where(encoder_hidden_states_mask, position_ids, position_ids.new_zeros(()))
has_active = encoder_hidden_states_mask.any(dim=1)
per_sample_len = torch.where(
has_active,
active_positions.max(dim=1).values + 1,
torch.as_tensor(text_seq_len, device=encoder_hidden_states.device),
)
per_sample_len = torch.where(has_active, active_positions.max(dim=1).values + 1, torch.as_tensor(text_seq_len))
return text_seq_len, per_sample_len, encoder_hidden_states_mask

View File

@@ -112,7 +112,7 @@ LIBRARIES = []
for library in LOADABLE_CLASSES:
LIBRARIES.append(library)
SUPPORTED_DEVICE_MAP = ["balanced"] + [get_device(), "cpu"]
SUPPORTED_DEVICE_MAP = ["balanced"] + [get_device()]
logger = logging.get_logger(__name__)
@@ -468,7 +468,8 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
pipeline_is_sequentially_offloaded = any(
module_is_sequentially_offloaded(module) for _, module in self.components.items()
)
is_pipeline_device_mapped = self._is_pipeline_device_mapped()
is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
if is_pipeline_device_mapped:
raise ValueError(
"It seems like you have activated a device mapping strategy on the pipeline which doesn't allow explicit device placement using `to()`. You can call `reset_device_map()` to remove the existing device map from the pipeline."
@@ -1187,7 +1188,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
"""
self._maybe_raise_error_if_group_offload_active(raise_error=True)
is_pipeline_device_mapped = self._is_pipeline_device_mapped()
is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
if is_pipeline_device_mapped:
raise ValueError(
"It seems like you have activated a device mapping strategy on the pipeline so calling `enable_model_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_model_cpu_offload()`."
@@ -1311,7 +1312,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
self.remove_all_hooks()
is_pipeline_device_mapped = self._is_pipeline_device_mapped()
is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
if is_pipeline_device_mapped:
raise ValueError(
"It seems like you have activated a device mapping strategy on the pipeline so calling `enable_sequential_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_sequential_cpu_offload()`."
@@ -2227,21 +2228,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
return True
return False
def _is_pipeline_device_mapped(self):
# We support passing `device_map="cuda"`, for example. This is helpful, in case
# users want to pass `device_map="cpu"` when initializing a pipeline. This explicit declaration is desirable
# in limited VRAM environments because quantized models often initialize directly on the accelerator.
device_map = self.hf_device_map
is_device_type_map = False
if isinstance(device_map, str):
try:
torch.device(device_map)
is_device_type_map = True
except RuntimeError:
pass
return not is_device_type_map and isinstance(device_map, dict) and len(device_map) > 1
class StableDiffusionMixin:
r"""

View File

@@ -18,6 +18,7 @@ import re
import urllib.parse as ul
from typing import Callable
import ftfy
import torch
from transformers import (
AutoTokenizer,
@@ -33,13 +34,13 @@ from diffusers.models.transformers.transformer_prx import PRXTransformer2DModel
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.pipelines.prx.pipeline_output import PRXPipelineOutput
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
from diffusers.utils import is_ftfy_available, logging, replace_example_docstring
from diffusers.utils import (
logging,
replace_example_docstring,
)
from diffusers.utils.torch_utils import randn_tensor
if is_ftfy_available():
import ftfy
DEFAULT_RESOLUTION = 512
ASPECT_RATIO_256_BIN = {

View File

@@ -516,9 +516,6 @@ def dequantize_gguf_tensor(tensor):
block_size, type_size = GGML_QUANT_SIZES[quant_type]
# Conver to plain tensor to avoid unnecessary __torch_function__ overhead.
tensor = tensor.as_tensor()
tensor = tensor.view(torch.uint8)
shape = _quant_shape_from_byte_shape(tensor.shape, type_size, block_size)
@@ -528,7 +525,7 @@ def dequantize_gguf_tensor(tensor):
dequant = dequant_fn(blocks, block_size, type_size)
dequant = dequant.reshape(shape)
return dequant
return dequant.as_tensor()
class GGUFParameter(torch.nn.Parameter):

View File

@@ -14,7 +14,6 @@
import math
from dataclasses import dataclass
from typing import Literal
import numpy as np
import torch
@@ -42,7 +41,7 @@ class FlowMatchLCMSchedulerOutput(BaseOutput):
denoising loop.
"""
prev_sample: torch.Tensor
prev_sample: torch.FloatTensor
class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
@@ -80,11 +79,11 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
use_beta_sigmas (`bool`, defaults to False):
Whether to use beta sigmas for step sizes in the noise schedule during sampling.
time_shift_type (`str`, defaults to "exponential"):
The type of dynamic resolution-dependent timestep shifting to apply.
scale_factors (`list[float]`, *optional*, defaults to `None`):
The type of dynamic resolution-dependent timestep shifting to apply. Either "exponential" or "linear".
scale_factors ('list', defaults to None)
It defines how to scale the latents at which predictions are made.
upscale_mode (`str`, *optional*, defaults to "bicubic"):
Upscaling method, applied if scale-wise generation is considered.
upscale_mode ('str', defaults to 'bicubic')
Upscaling method, applied if scale-wise generation is considered
"""
_compatibles = []
@@ -102,33 +101,16 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
max_image_seq_len: int = 4096,
invert_sigmas: bool = False,
shift_terminal: float | None = None,
use_karras_sigmas: bool | None = False,
use_exponential_sigmas: bool | None = False,
use_beta_sigmas: bool | None = False,
time_shift_type: Literal["exponential", "linear"] = "exponential",
use_karras_sigmas: bool = False,
use_exponential_sigmas: bool = False,
use_beta_sigmas: bool = False,
time_shift_type: str = "exponential",
scale_factors: list[float] | None = None,
upscale_mode: Literal[
"nearest",
"linear",
"bilinear",
"bicubic",
"trilinear",
"area",
"nearest-exact",
] = "bicubic",
upscale_mode: str = "bicubic",
):
if self.config.use_beta_sigmas and not is_scipy_available():
raise ImportError("Make sure to install scipy if you want to use beta sigmas.")
if (
sum(
[
self.config.use_beta_sigmas,
self.config.use_exponential_sigmas,
self.config.use_karras_sigmas,
]
)
> 1
):
if sum([self.config.use_beta_sigmas, self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
raise ValueError(
"Only one of `config.use_beta_sigmas`, `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used."
)
@@ -180,7 +162,7 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
return self._begin_index
# Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
def set_begin_index(self, begin_index: int = 0) -> None:
def set_begin_index(self, begin_index: int = 0):
"""
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
@@ -190,18 +172,18 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
"""
self._begin_index = begin_index
def set_shift(self, shift: float) -> None:
def set_shift(self, shift: float):
self._shift = shift
def set_scale_factors(self, scale_factors: list[float], upscale_mode: str) -> None:
def set_scale_factors(self, scale_factors: list, upscale_mode):
"""
Sets scale factors for a scale-wise generation regime.
Args:
scale_factors (`list[float]`):
The scale factors for each step.
scale_factors (`list`):
The scale factors for each step
upscale_mode (`str`):
Upscaling method.
Upscaling method
"""
self._scale_factors = scale_factors
self._upscale_mode = upscale_mode
@@ -256,18 +238,16 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
return sample
def _sigma_to_t(self, sigma: float | torch.FloatTensor) -> float | torch.FloatTensor:
def _sigma_to_t(self, sigma):
return sigma * self.config.num_train_timesteps
def time_shift(
self, mu: float, sigma: float, t: float | np.ndarray | torch.Tensor
) -> float | np.ndarray | torch.Tensor:
def time_shift(self, mu: float, sigma: float, t: torch.Tensor):
if self.config.time_shift_type == "exponential":
return self._time_shift_exponential(mu, sigma, t)
elif self.config.time_shift_type == "linear":
return self._time_shift_linear(mu, sigma, t)
def stretch_shift_to_terminal(self, t: np.ndarray | torch.Tensor) -> np.ndarray | torch.Tensor:
def stretch_shift_to_terminal(self, t: torch.Tensor) -> torch.Tensor:
r"""
Stretches and shifts the timestep schedule to ensure it terminates at the configured `shift_terminal` config
value.
@@ -276,13 +256,12 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
https://github.com/Lightricks/LTX-Video/blob/a01a171f8fe3d99dce2728d60a73fecf4d4238ae/ltx_video/schedulers/rf.py#L51
Args:
t (`torch.Tensor` or `np.ndarray`):
A tensor or numpy array of timesteps to be stretched and shifted.
t (`torch.Tensor`):
A tensor of timesteps to be stretched and shifted.
Returns:
`torch.Tensor` or `np.ndarray`:
A tensor or numpy array of adjusted timesteps such that the final value equals
`self.config.shift_terminal`.
`torch.Tensor`:
A tensor of adjusted timesteps such that the final value equals `self.config.shift_terminal`.
"""
one_minus_z = 1 - t
scale_factor = one_minus_z[-1] / (1 - self.config.shift_terminal)
@@ -291,12 +270,12 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
def set_timesteps(
self,
num_inference_steps: int | None = None,
device: str | torch.device | None = None,
num_inference_steps: int = None,
device: str | torch.device = None,
sigmas: list[float] | None = None,
mu: float | None = None,
mu: float = None,
timesteps: list[float] | None = None,
) -> None:
):
"""
Sets the discrete timesteps used for the diffusion chain (to be run before inference).
@@ -338,45 +317,43 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
is_timesteps_provided = timesteps is not None
if is_timesteps_provided:
timesteps = np.array(timesteps).astype(np.float32) # type: ignore
timesteps = np.array(timesteps).astype(np.float32)
if sigmas is None:
if timesteps is None:
timesteps = np.linspace( # type: ignore
self._sigma_to_t(self.sigma_max),
self._sigma_to_t(self.sigma_min),
num_inference_steps,
timesteps = np.linspace(
self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps
)
sigmas = timesteps / self.config.num_train_timesteps # type: ignore
sigmas = timesteps / self.config.num_train_timesteps
else:
sigmas = np.array(sigmas).astype(np.float32) # type: ignore
sigmas = np.array(sigmas).astype(np.float32)
num_inference_steps = len(sigmas)
# 2. Perform timestep shifting. Either no shifting is applied, or resolution-dependent shifting of
# "exponential" or "linear" type is applied
if self.config.use_dynamic_shifting:
sigmas = self.time_shift(mu, 1.0, sigmas) # type: ignore
sigmas = self.time_shift(mu, 1.0, sigmas)
else:
sigmas = self.shift * sigmas / (1 + (self.shift - 1) * sigmas) # type: ignore
sigmas = self.shift * sigmas / (1 + (self.shift - 1) * sigmas)
# 3. If required, stretch the sigmas schedule to terminate at the configured `shift_terminal` value
if self.config.shift_terminal:
sigmas = self.stretch_shift_to_terminal(sigmas) # type: ignore
sigmas = self.stretch_shift_to_terminal(sigmas)
# 4. If required, convert sigmas to one of karras, exponential, or beta sigma schedules
if self.config.use_karras_sigmas:
sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) # type: ignore
sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
elif self.config.use_exponential_sigmas:
sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=num_inference_steps) # type: ignore
sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
elif self.config.use_beta_sigmas:
sigmas = self._convert_to_beta(in_sigmas=sigmas, num_inference_steps=num_inference_steps) # type: ignore
sigmas = self._convert_to_beta(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
# 5. Convert sigmas and timesteps to tensors and move to specified device
sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) # type: ignore
sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)
if not is_timesteps_provided:
timesteps = sigmas * self.config.num_train_timesteps # type: ignore
timesteps = sigmas * self.config.num_train_timesteps
else:
timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32, device=device) # type: ignore
timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32, device=device)
# 6. Append the terminal sigma value.
# If a model requires inverted sigma schedule for denoising but timesteps without inversion, the
@@ -393,11 +370,7 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
self._step_index = None
self._begin_index = None
def index_for_timestep(
self,
timestep: float | torch.Tensor,
schedule_timesteps: torch.Tensor | None = None,
) -> int:
def index_for_timestep(self, timestep, schedule_timesteps=None):
if schedule_timesteps is None:
schedule_timesteps = self.timesteps
@@ -409,9 +382,9 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
# case we start in the middle of the denoising schedule (e.g. for image-to-image)
pos = 1 if len(indices) > 1 else 0
return int(indices[pos].item())
return indices[pos].item()
def _init_step_index(self, timestep: float | torch.Tensor) -> None:
def _init_step_index(self, timestep):
if self.begin_index is None:
if isinstance(timestep, torch.Tensor):
timestep = timestep.to(self.timesteps.device)
@@ -486,12 +459,7 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
size = [round(self._scale_factors[self._step_index] * size) for size in self._init_size]
x0_pred = torch.nn.functional.interpolate(x0_pred, size=size, mode=self._upscale_mode)
noise = randn_tensor(
x0_pred.shape,
generator=generator,
device=x0_pred.device,
dtype=x0_pred.dtype,
)
noise = randn_tensor(x0_pred.shape, generator=generator, device=x0_pred.device, dtype=x0_pred.dtype)
prev_sample = (1 - sigma_next) * x0_pred + sigma_next * noise
# upon completion increase step index by one
@@ -505,7 +473,7 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
return FlowMatchLCMSchedulerOutput(prev_sample=prev_sample)
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
"""
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
Models](https://huggingface.co/papers/2206.00364).
@@ -626,15 +594,11 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
)
return sigmas
def _time_shift_exponential(
self, mu: float, sigma: float, t: float | np.ndarray | torch.Tensor
) -> float | np.ndarray | torch.Tensor:
def _time_shift_exponential(self, mu, sigma, t):
return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
def _time_shift_linear(
self, mu: float, sigma: float, t: float | np.ndarray | torch.Tensor
) -> float | np.ndarray | torch.Tensor:
def _time_shift_linear(self, mu, sigma, t):
return mu / (mu + (1 / t - 1) ** sigma)
def __len__(self) -> int:
def __len__(self):
return self.config.num_train_timesteps

View File

@@ -81,7 +81,7 @@ class TorchCompileTesterMixin:
_ = model(**inputs_dict)
@torch.no_grad()
def test_torch_compile_repeated_blocks(self, recompile_limit=1):
def test_torch_compile_repeated_blocks(self):
if self.model_class._repeated_blocks is None:
pytest.skip("Skipping test as the model class doesn't have `_repeated_blocks` set.")
@@ -92,6 +92,7 @@ class TorchCompileTesterMixin:
model.eval()
model.compile_repeated_blocks(fullgraph=True)
recompile_limit = 1
if self.model_class.__name__ == "UNet2DConditionModel":
recompile_limit = 2

View File

@@ -375,7 +375,7 @@ class LoraHotSwappingForModelTesterMixin:
# additionally check if dynamic compilation works.
if different_shapes is not None:
for height, width in different_shapes:
new_inputs_dict = self.get_dummy_inputs(height=height, width=width)
new_inputs_dict = self.prepare_dummy_input(height=height, width=width)
_ = model(**new_inputs_dict)
else:
output0_after = model(**inputs_dict)["sample"]
@@ -390,7 +390,7 @@ class LoraHotSwappingForModelTesterMixin:
with torch.inference_mode():
if different_shapes is not None:
for height, width in different_shapes:
new_inputs_dict = self.get_dummy_inputs(height=height, width=width)
new_inputs_dict = self.prepare_dummy_input(height=height, width=width)
_ = model(**new_inputs_dict)
else:
output1_after = model(**inputs_dict)["sample"]

View File

@@ -628,21 +628,6 @@ class BitsAndBytesTesterMixin(BitsAndBytesConfigMixin, QuantizationTesterMixin):
"""Test that quantized models can be used for training with adapters."""
self._test_quantization_training(BitsAndBytesConfigMixin.BNB_CONFIGS["4bit_nf4"])
@pytest.mark.parametrize(
"config_name",
list(BitsAndBytesConfigMixin.BNB_CONFIGS.keys()),
ids=list(BitsAndBytesConfigMixin.BNB_CONFIGS.keys()),
)
def test_cpu_device_map(self, config_name):
config_kwargs = BitsAndBytesConfigMixin.BNB_CONFIGS[config_name]
model_quantized = self._create_quantized_model(config_kwargs, device_map="cpu")
assert hasattr(model_quantized, "hf_device_map"), "Model should have hf_device_map attribute"
assert model_quantized.hf_device_map is not None, "hf_device_map should not be None"
assert model_quantized.device == torch.device("cpu"), (
f"Model should be on CPU, but is on {model_quantized.device}"
)
@is_quantization
@is_quanto

View File

@@ -147,7 +147,22 @@ class TestWanVACETransformer3DCompile(WanVACETransformer3DTesterConfig, TorchCom
def test_torch_compile_repeated_blocks(self):
# WanVACE has two block types (WanTransformerBlock and WanVACETransformerBlock),
# so we need recompile_limit=2 instead of the default 1.
super().test_torch_compile_repeated_blocks(recompile_limit=2)
import torch._dynamo
import torch._inductor.utils
init_dict = self.get_init_dict()
inputs_dict = self.get_dummy_inputs()
model = self.model_class(**init_dict).to(torch_device)
model.eval()
model.compile_repeated_blocks(fullgraph=True)
with (
torch._inductor.utils.fresh_inductor_cache(),
torch._dynamo.config.patch(recompile_limit=2),
):
_ = model(**inputs_dict)
_ = model(**inputs_dict)
class TestWanVACETransformer3DBitsAndBytes(WanVACETransformer3DTesterConfig, BitsAndBytesTesterMixin):

View File

@@ -158,10 +158,6 @@ class AllegroPipelineFastTests(PipelineTesterMixin, PyramidAttentionBroadcastTes
def test_save_load_optional_components(self):
pass
@unittest.skip("Decoding without tiling is not yet implemented")
def test_pipeline_with_accelerator_device_map(self):
pass
def test_inference(self):
device = "cpu"

View File

@@ -34,7 +34,9 @@ enable_full_determinism()
class KandinskyPipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = KandinskyCombinedPipeline
params = ["prompt"]
params = [
"prompt",
]
batch_params = ["prompt", "negative_prompt"]
required_optional_params = [
"generator",
@@ -146,10 +148,6 @@ class KandinskyPipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase)
def test_dict_tuple_outputs_equivalent(self):
super().test_dict_tuple_outputs_equivalent(expected_max_difference=5e-4)
@unittest.skip("Test not supported.")
def test_pipeline_with_accelerator_device_map(self):
pass
class KandinskyPipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = KandinskyImg2ImgCombinedPipeline
@@ -266,10 +264,6 @@ class KandinskyPipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.Te
def test_save_load_optional_components(self):
super().test_save_load_optional_components(expected_max_difference=5e-4)
@unittest.skip("Test not supported.")
def test_pipeline_with_accelerator_device_map(self):
pass
class KandinskyPipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = KandinskyInpaintCombinedPipeline
@@ -390,7 +384,3 @@ class KandinskyPipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.Te
def test_save_load_local(self):
super().test_save_load_local(expected_max_difference=5e-3)
@unittest.skip("Test not supported.")
def test_pipeline_with_accelerator_device_map(self):
pass

View File

@@ -36,7 +36,9 @@ enable_full_determinism()
class KandinskyV22PipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = KandinskyV22CombinedPipeline
params = ["prompt"]
params = [
"prompt",
]
batch_params = ["prompt", "negative_prompt"]
required_optional_params = [
"generator",
@@ -68,7 +70,12 @@ class KandinskyV22PipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCa
def get_dummy_inputs(self, device, seed=0):
prior_dummy = PriorDummies()
inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
inputs.update({"height": 64, "width": 64})
inputs.update(
{
"height": 64,
"width": 64,
}
)
return inputs
def test_kandinsky(self):
@@ -148,18 +155,12 @@ class KandinskyV22PipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCa
def test_save_load_optional_components(self):
super().test_save_load_optional_components(expected_max_difference=5e-3)
@unittest.skip("Test not supported.")
def test_callback_inputs(self):
pass
@unittest.skip("Test not supported.")
def test_callback_cfg(self):
pass
@unittest.skip("Test not supported.")
def test_pipeline_with_accelerator_device_map(self):
pass
class KandinskyV22PipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = KandinskyV22Img2ImgCombinedPipeline
@@ -278,18 +279,12 @@ class KandinskyV22PipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest
def save_load_local(self):
super().test_save_load_local(expected_max_difference=5e-3)
@unittest.skip("Test not supported.")
def test_callback_inputs(self):
pass
@unittest.skip("Test not supported.")
def test_callback_cfg(self):
pass
@unittest.skip("Test not supported.")
def test_pipeline_with_accelerator_device_map(self):
pass
class KandinskyV22PipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = KandinskyV22InpaintCombinedPipeline
@@ -416,7 +411,3 @@ class KandinskyV22PipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest
def test_callback_cfg(self):
pass
@unittest.skip("`device_map` is not yet supported for connected pipelines.")
def test_pipeline_with_accelerator_device_map(self):
pass

View File

@@ -296,9 +296,6 @@ class KandinskyV22InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCas
output = pipe(**inputs)[0]
assert output.abs().sum() == 0
def test_pipeline_with_accelerator_device_map(self):
super().test_pipeline_with_accelerator_device_map(expected_max_difference=5e-3)
@slow
@require_torch_accelerator

View File

@@ -194,9 +194,6 @@ class Kandinsky3Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase)
def test_save_load_dduf(self):
super().test_save_load_dduf(atol=1e-3, rtol=1e-3)
def test_pipeline_with_accelerator_device_map(self):
super().test_pipeline_with_accelerator_device_map(expected_max_difference=5e-3)
@slow
@require_torch_accelerator

View File

@@ -1,6 +1,7 @@
import unittest
import numpy as np
import pytest
import torch
from transformers import AutoTokenizer
from transformers.models.t5gemma.configuration_t5gemma import T5GemmaConfig, T5GemmaModuleConfig
@@ -10,11 +11,17 @@ from diffusers.models import AutoencoderDC, AutoencoderKL
from diffusers.models.transformers.transformer_prx import PRXTransformer2DModel
from diffusers.pipelines.prx.pipeline_prx import PRXPipeline
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
from diffusers.utils import is_transformers_version
from ..pipeline_params import TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
@pytest.mark.xfail(
condition=is_transformers_version(">", "4.57.1"),
reason="See https://github.com/huggingface/diffusers/pull/12456#issuecomment-3424228544",
strict=False,
)
class PRXPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = PRXPipeline
params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}

View File

@@ -2355,6 +2355,7 @@ class PipelineTesterMixin:
f"Component '{name}' has dtype {component.dtype} but expected {expected_dtype}",
)
@require_torch_accelerator
def test_pipeline_with_accelerator_device_map(self, expected_max_difference=1e-4):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)

View File

@@ -342,7 +342,3 @@ class VisualClozePipelineFastTests(unittest.TestCase, PipelineTesterMixin):
self.assertLess(
max_diff, expected_max_diff, "The output of the fp16 pipeline changed after saving and loading."
)
@unittest.skip("Test not supported.")
def test_pipeline_with_accelerator_device_map(self):
pass

View File

@@ -310,7 +310,3 @@ class VisualClozeGenerationPipelineFastTests(unittest.TestCase, PipelineTesterMi
@unittest.skip("Skipped due to missing layout_prompt. Needs further investigation.")
def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=None, atol=0.0001, rtol=0.0001):
pass
@unittest.skip("Needs to be revisited later.")
def test_pipeline_with_accelerator_device_map(self, expected_max_difference=0.0001):
pass