mirror of
https://github.com/huggingface/diffusers.git
synced 2026-03-02 23:00:39 +08:00
Compare commits
5 Commits
requiremen
...
dynamic-mo
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b1df740aac | ||
|
|
8d20369792 | ||
|
|
5910a1cc6c | ||
|
|
40e96454f1 | ||
|
|
47455bd133 |
@@ -856,7 +856,7 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
|
||||
)
|
||||
state_dict = {k: v for k, v in state_dict.items() if not k.startswith("text_encoders.t5xxl.transformer.")}
|
||||
|
||||
has_diffb = any("diff_b" in k and k.startswith(("lora_unet_", "lora_te_")) for k in state_dict)
|
||||
has_diffb = any("diff_b" in k and k.startswith(("lora_unet_", "lora_te_", "lora_te1_")) for k in state_dict)
|
||||
if has_diffb:
|
||||
zero_status_diff_b = state_dict_all_zero(state_dict, ".diff_b")
|
||||
if zero_status_diff_b:
|
||||
@@ -895,7 +895,7 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
|
||||
state_dict = {
|
||||
_custom_replace(k, limit_substrings): v
|
||||
for k, v in state_dict.items()
|
||||
if k.startswith(("lora_unet_", "lora_te_"))
|
||||
if k.startswith(("lora_unet_", "lora_te_", "lora_te1_"))
|
||||
}
|
||||
|
||||
if any("text_projection" in k for k in state_dict):
|
||||
|
||||
@@ -733,7 +733,7 @@ def _wrapped_flash_attn_3(
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
# Hardcoded for now because pytorch does not support tuple/int type hints
|
||||
window_size = (-1, -1)
|
||||
out, lse, *_ = flash_attn_3_func(
|
||||
result = flash_attn_3_func(
|
||||
q=q,
|
||||
k=k,
|
||||
v=v,
|
||||
@@ -750,7 +750,9 @@ def _wrapped_flash_attn_3(
|
||||
pack_gqa=pack_gqa,
|
||||
deterministic=deterministic,
|
||||
sm_margin=sm_margin,
|
||||
return_attn_probs=True,
|
||||
)
|
||||
out, lse, *_ = result
|
||||
lse = lse.permute(0, 2, 1)
|
||||
return out, lse
|
||||
|
||||
@@ -2701,7 +2703,7 @@ def _flash_varlen_attention_3(
|
||||
key_packed = torch.cat(key_valid, dim=0)
|
||||
value_packed = torch.cat(value_valid, dim=0)
|
||||
|
||||
out, lse, *_ = flash_attn_3_varlen_func(
|
||||
result = flash_attn_3_varlen_func(
|
||||
q=query_packed,
|
||||
k=key_packed,
|
||||
v=value_packed,
|
||||
@@ -2711,7 +2713,13 @@ def _flash_varlen_attention_3(
|
||||
max_seqlen_k=max_seqlen_k,
|
||||
softmax_scale=scale,
|
||||
causal=is_causal,
|
||||
return_attn_probs=return_lse,
|
||||
)
|
||||
if isinstance(result, tuple):
|
||||
out, lse, *_ = result
|
||||
else:
|
||||
out = result
|
||||
lse = None
|
||||
out = out.unflatten(0, (batch_size, -1))
|
||||
|
||||
return (out, lse) if return_lse else out
|
||||
|
||||
@@ -699,9 +699,13 @@ class LTX2ImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTX2LoraL
|
||||
mask_shape = (batch_size, 1, num_frames, height, width)
|
||||
|
||||
if latents is not None:
|
||||
conditioning_mask = latents.new_zeros(mask_shape)
|
||||
conditioning_mask[:, :, 0] = 1.0
|
||||
if latents.ndim == 5:
|
||||
# conditioning_mask needs to the same shape as latents in two stages generation.
|
||||
batch_size, _, num_frames, height, width = latents.shape
|
||||
mask_shape = (batch_size, 1, num_frames, height, width)
|
||||
conditioning_mask = latents.new_zeros(mask_shape)
|
||||
conditioning_mask[:, :, 0] = 1.0
|
||||
|
||||
latents = self._normalize_latents(
|
||||
latents, self.vae.latents_mean, self.vae.latents_std, self.vae.config.scaling_factor
|
||||
)
|
||||
@@ -710,6 +714,9 @@ class LTX2ImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTX2LoraL
|
||||
latents = self._pack_latents(
|
||||
latents, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
|
||||
)
|
||||
else:
|
||||
conditioning_mask = latents.new_zeros(mask_shape)
|
||||
conditioning_mask[:, :, 0] = 1.0
|
||||
conditioning_mask = self._pack_latents(
|
||||
conditioning_mask, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
|
||||
).squeeze(-1)
|
||||
|
||||
@@ -299,7 +299,10 @@ def get_cached_module_file(
|
||||
# Download and cache module_file from the repo `pretrained_model_name_or_path` of grab it if it's a local file.
|
||||
pretrained_model_name_or_path = str(pretrained_model_name_or_path)
|
||||
|
||||
module_file_or_url = os.path.join(pretrained_model_name_or_path, module_file)
|
||||
if subfolder is not None:
|
||||
module_file_or_url = os.path.join(pretrained_model_name_or_path, subfolder, module_file)
|
||||
else:
|
||||
module_file_or_url = os.path.join(pretrained_model_name_or_path, module_file)
|
||||
|
||||
if os.path.isfile(module_file_or_url):
|
||||
resolved_module_file = module_file_or_url
|
||||
@@ -384,7 +387,11 @@ def get_cached_module_file(
|
||||
if not os.path.exists(submodule_path / module_folder):
|
||||
os.makedirs(submodule_path / module_folder)
|
||||
module_needed = f"{module_needed}.py"
|
||||
shutil.copyfile(os.path.join(pretrained_model_name_or_path, module_needed), submodule_path / module_needed)
|
||||
if subfolder is not None:
|
||||
source_path = os.path.join(pretrained_model_name_or_path, subfolder, module_needed)
|
||||
else:
|
||||
source_path = os.path.join(pretrained_model_name_or_path, module_needed)
|
||||
shutil.copyfile(source_path, submodule_path / module_needed)
|
||||
else:
|
||||
# Get the commit hash
|
||||
# TODO: we will get this info in the etag soon, so retrieve it from there and not here.
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import torch
|
||||
from transformers import CLIPTextModel, LongformerModel
|
||||
|
||||
from diffusers.models import AutoModel, UNet2DConditionModel
|
||||
@@ -35,6 +39,45 @@ class TestAutoModel(unittest.TestCase):
|
||||
)
|
||||
assert isinstance(model, CLIPTextModel)
|
||||
|
||||
def test_load_dynamic_module_from_local_path_with_subfolder(self):
|
||||
CUSTOM_MODEL_CODE = (
|
||||
"import torch\n"
|
||||
"from diffusers import ModelMixin, ConfigMixin\n"
|
||||
"from diffusers.configuration_utils import register_to_config\n"
|
||||
"\n"
|
||||
"class CustomModel(ModelMixin, ConfigMixin):\n"
|
||||
" @register_to_config\n"
|
||||
" def __init__(self, hidden_size=8):\n"
|
||||
" super().__init__()\n"
|
||||
" self.linear = torch.nn.Linear(hidden_size, hidden_size)\n"
|
||||
"\n"
|
||||
" def forward(self, x):\n"
|
||||
" return self.linear(x)\n"
|
||||
)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
subfolder = "custom_model"
|
||||
model_dir = os.path.join(tmpdir, subfolder)
|
||||
os.makedirs(model_dir)
|
||||
|
||||
with open(os.path.join(model_dir, "modeling.py"), "w") as f:
|
||||
f.write(CUSTOM_MODEL_CODE)
|
||||
|
||||
config = {
|
||||
"_class_name": "CustomModel",
|
||||
"_diffusers_version": "0.0.0",
|
||||
"auto_map": {"AutoModel": "modeling.CustomModel"},
|
||||
"hidden_size": 8,
|
||||
}
|
||||
with open(os.path.join(model_dir, "config.json"), "w") as f:
|
||||
json.dump(config, f)
|
||||
|
||||
torch.save({}, os.path.join(model_dir, "diffusion_pytorch_model.bin"))
|
||||
|
||||
model = AutoModel.from_pretrained(tmpdir, subfolder=subfolder, trust_remote_code=True)
|
||||
assert model.__class__.__name__ == "CustomModel"
|
||||
assert model.config["hidden_size"] == 8
|
||||
|
||||
|
||||
class TestAutoModelFromConfig(unittest.TestCase):
|
||||
@patch(
|
||||
|
||||
@@ -24,7 +24,8 @@ from diffusers import (
|
||||
LTX2ImageToVideoPipeline,
|
||||
LTX2VideoTransformer3DModel,
|
||||
)
|
||||
from diffusers.pipelines.ltx2 import LTX2TextConnectors
|
||||
from diffusers.pipelines.ltx2 import LTX2LatentUpsamplePipeline, LTX2TextConnectors
|
||||
from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
|
||||
from diffusers.pipelines.ltx2.vocoder import LTX2Vocoder
|
||||
|
||||
from ...testing_utils import enable_full_determinism
|
||||
@@ -174,6 +175,15 @@ class LTX2ImageToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
|
||||
return components
|
||||
|
||||
def get_dummy_upsample_component(self, in_channels=4, mid_channels=32, num_blocks_per_stage=1):
|
||||
upsampler = LTX2LatentUpsamplerModel(
|
||||
in_channels=in_channels,
|
||||
mid_channels=mid_channels,
|
||||
num_blocks_per_stage=num_blocks_per_stage,
|
||||
)
|
||||
|
||||
return upsampler
|
||||
|
||||
def get_dummy_inputs(self, device, seed=0):
|
||||
if str(device).startswith("mps"):
|
||||
generator = torch.manual_seed(seed)
|
||||
@@ -287,5 +297,60 @@ class LTX2ImageToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
assert torch.allclose(expected_video_slice, generated_video_slice, atol=1e-4, rtol=1e-4)
|
||||
assert torch.allclose(expected_audio_slice, generated_audio_slice, atol=1e-4, rtol=1e-4)
|
||||
|
||||
def test_two_stages_inference_with_upsampler(self):
|
||||
device = "cpu"
|
||||
|
||||
components = self.get_dummy_components()
|
||||
pipe = self.pipeline_class(**components)
|
||||
pipe.to(device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
inputs = self.get_dummy_inputs(device)
|
||||
inputs["output_type"] = "latent"
|
||||
first_stage_output = pipe(**inputs)
|
||||
video_latent = first_stage_output.frames
|
||||
audio_latent = first_stage_output.audio
|
||||
|
||||
self.assertEqual(video_latent.shape, (1, 4, 3, 16, 16))
|
||||
self.assertEqual(audio_latent.shape, (1, 2, 5, 2))
|
||||
self.assertEqual(audio_latent.shape[1], components["vocoder"].config.out_channels)
|
||||
|
||||
upsampler = self.get_dummy_upsample_component(in_channels=video_latent.shape[1])
|
||||
upsample_pipe = LTX2LatentUpsamplePipeline(vae=pipe.vae, latent_upsampler=upsampler)
|
||||
upscaled_video_latent = upsample_pipe(latents=video_latent, output_type="latent", return_dict=False)[0]
|
||||
self.assertEqual(upscaled_video_latent.shape, (1, 4, 3, 32, 32))
|
||||
|
||||
inputs["latents"] = upscaled_video_latent
|
||||
inputs["audio_latents"] = audio_latent
|
||||
inputs["output_type"] = "pt"
|
||||
second_stage_output = pipe(**inputs)
|
||||
video = second_stage_output.frames
|
||||
audio = second_stage_output.audio
|
||||
|
||||
self.assertEqual(video.shape, (1, 5, 3, 64, 64))
|
||||
self.assertEqual(audio.shape[0], 1)
|
||||
self.assertEqual(audio.shape[1], components["vocoder"].config.out_channels)
|
||||
|
||||
# fmt: off
|
||||
expected_video_slice = torch.tensor(
|
||||
[
|
||||
0.4497, 0.6757, 0.4219, 0.7686, 0.4525, 0.6483, 0.3969, 0.7404, 0.3541, 0.3039, 0.4592, 0.3521, 0.3665, 0.2785, 0.3336, 0.3079
|
||||
]
|
||||
)
|
||||
expected_audio_slice = torch.tensor(
|
||||
[
|
||||
0.0271, 0.0492, 0.1249, 0.1126, 0.1661, 0.1060, 0.1717, 0.0944, 0.0672, -0.0069, 0.0688, 0.0097, 0.0808, 0.1231, 0.0986, 0.0739
|
||||
]
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
video = video.flatten()
|
||||
audio = audio.flatten()
|
||||
generated_video_slice = torch.cat([video[:8], video[-8:]])
|
||||
generated_audio_slice = torch.cat([audio[:8], audio[-8:]])
|
||||
|
||||
assert torch.allclose(expected_video_slice, generated_video_slice, atol=1e-4, rtol=1e-4)
|
||||
assert torch.allclose(expected_audio_slice, generated_audio_slice, atol=1e-4, rtol=1e-4)
|
||||
|
||||
def test_inference_batch_single_identical(self):
|
||||
self._test_inference_batch_single_identical(batch_size=2, expected_max_diff=2e-2)
|
||||
|
||||
Reference in New Issue
Block a user