mirror of
https://github.com/huggingface/diffusers.git
synced 2026-02-18 00:38:40 +08:00
Compare commits
2 Commits
add-ftfy-t
...
deprecate-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
36c0d78b8b | ||
|
|
66f6f8b926 |
6
.github/workflows/pr_tests.yml
vendored
6
.github/workflows/pr_tests.yml
vendored
@@ -114,7 +114,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
uv pip install -e ".[quality,test]"
|
||||
uv pip install -e ".[quality]"
|
||||
#uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
|
||||
uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
|
||||
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
|
||||
@@ -191,7 +191,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
uv pip install -e ".[quality,test]"
|
||||
uv pip install -e ".[quality]"
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
@@ -242,7 +242,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
uv pip install -e ".[quality,test]"
|
||||
uv pip install -e ".[quality]"
|
||||
# TODO (sayakpaul, DN6): revisit `--no-deps`
|
||||
uv pip install -U peft@git+https://github.com/huggingface/peft.git --no-deps
|
||||
uv pip install -U tokenizers
|
||||
|
||||
4
setup.py
4
setup.py
@@ -101,7 +101,6 @@ _deps = [
|
||||
"datasets",
|
||||
"filelock",
|
||||
"flax>=0.4.1",
|
||||
"ftfy",
|
||||
"hf-doc-builder>=0.3.0",
|
||||
"httpx<1.0.0",
|
||||
"huggingface-hub>=0.34.0,<2.0",
|
||||
@@ -222,14 +221,12 @@ extras["docs"] = deps_list("hf-doc-builder")
|
||||
extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2", "peft", "timm")
|
||||
extras["test"] = deps_list(
|
||||
"compel",
|
||||
"ftfy",
|
||||
"GitPython",
|
||||
"datasets",
|
||||
"Jinja2",
|
||||
"invisible-watermark",
|
||||
"librosa",
|
||||
"parameterized",
|
||||
"protobuf",
|
||||
"pytest",
|
||||
"pytest-timeout",
|
||||
"pytest-xdist",
|
||||
@@ -238,7 +235,6 @@ extras["test"] = deps_list(
|
||||
"sentencepiece",
|
||||
"scipy",
|
||||
"tiktoken",
|
||||
"torchsde",
|
||||
"torchvision",
|
||||
"transformers",
|
||||
"phonemizer",
|
||||
|
||||
@@ -8,7 +8,6 @@ deps = {
|
||||
"datasets": "datasets",
|
||||
"filelock": "filelock",
|
||||
"flax": "flax>=0.4.1",
|
||||
"ftfy": "ftfy",
|
||||
"hf-doc-builder": "hf-doc-builder>=0.3.0",
|
||||
"httpx": "httpx<1.0.0",
|
||||
"huggingface-hub": "huggingface-hub>=0.34.0,<2.0",
|
||||
|
||||
@@ -1117,26 +1117,6 @@ def _sage_attention_backward_op(
|
||||
raise NotImplementedError("Backward pass is not implemented for Sage attention.")
|
||||
|
||||
|
||||
def _maybe_modify_attn_mask_npu(query: torch.Tensor, key: torch.Tensor, attn_mask: torch.Tensor | None = None):
|
||||
# Skip Attention Mask if all values are 1, `None` mask can speedup the computation
|
||||
if attn_mask is not None and torch.all(attn_mask != 0):
|
||||
attn_mask = None
|
||||
|
||||
# Reshape Attention Mask: [batch_size, seq_len_k] -> [batch_size, 1, sqe_len_q, seq_len_k]
|
||||
# https://www.hiascend.com/document/detail/zh/Pytorch/730/apiref/torchnpuCustomsapi/docs/context/torch_npu-npu_fusion_attention.md
|
||||
if (
|
||||
attn_mask is not None
|
||||
and attn_mask.ndim == 2
|
||||
and attn_mask.shape[0] == query.shape[0]
|
||||
and attn_mask.shape[1] == key.shape[1]
|
||||
):
|
||||
B, Sq, Skv = attn_mask.shape[0], query.shape[1], key.shape[1]
|
||||
attn_mask = ~attn_mask.to(torch.bool)
|
||||
attn_mask = attn_mask.unsqueeze(1).expand(B, Sq, Skv).unsqueeze(1).contiguous()
|
||||
|
||||
return attn_mask
|
||||
|
||||
|
||||
def _npu_attention_forward_op(
|
||||
ctx: torch.autograd.function.FunctionCtx,
|
||||
query: torch.Tensor,
|
||||
@@ -1154,14 +1134,11 @@ def _npu_attention_forward_op(
|
||||
if return_lse:
|
||||
raise ValueError("NPU attention backend does not support setting `return_lse=True`.")
|
||||
|
||||
attn_mask = _maybe_modify_attn_mask_npu(query, key, attn_mask)
|
||||
|
||||
out = npu_fusion_attention(
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
query.size(2), # num_heads
|
||||
atten_mask=attn_mask,
|
||||
input_layout="BSND",
|
||||
pse=None,
|
||||
scale=1.0 / math.sqrt(query.shape[-1]) if scale is None else scale,
|
||||
@@ -2691,17 +2668,16 @@ def _native_npu_attention(
|
||||
return_lse: bool = False,
|
||||
_parallel_config: "ParallelConfig" | None = None,
|
||||
) -> torch.Tensor:
|
||||
if attn_mask is not None:
|
||||
raise ValueError("`attn_mask` is not supported for NPU attention")
|
||||
if return_lse:
|
||||
raise ValueError("NPU attention backend does not support setting `return_lse=True`.")
|
||||
if _parallel_config is None:
|
||||
attn_mask = _maybe_modify_attn_mask_npu(query, key, attn_mask)
|
||||
|
||||
out = npu_fusion_attention(
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
query.size(2), # num_heads
|
||||
atten_mask=attn_mask,
|
||||
input_layout="BSND",
|
||||
pse=None,
|
||||
scale=1.0 / math.sqrt(query.shape[-1]) if scale is None else scale,
|
||||
@@ -2716,7 +2692,7 @@ def _native_npu_attention(
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
attn_mask,
|
||||
None,
|
||||
dropout_p,
|
||||
None,
|
||||
scale,
|
||||
|
||||
@@ -164,11 +164,7 @@ def compute_text_seq_len_from_mask(
|
||||
position_ids = torch.arange(text_seq_len, device=encoder_hidden_states.device, dtype=torch.long)
|
||||
active_positions = torch.where(encoder_hidden_states_mask, position_ids, position_ids.new_zeros(()))
|
||||
has_active = encoder_hidden_states_mask.any(dim=1)
|
||||
per_sample_len = torch.where(
|
||||
has_active,
|
||||
active_positions.max(dim=1).values + 1,
|
||||
torch.as_tensor(text_seq_len, device=encoder_hidden_states.device),
|
||||
)
|
||||
per_sample_len = torch.where(has_active, active_positions.max(dim=1).values + 1, torch.as_tensor(text_seq_len))
|
||||
return text_seq_len, per_sample_len, encoder_hidden_states_mask
|
||||
|
||||
|
||||
|
||||
@@ -112,7 +112,7 @@ LIBRARIES = []
|
||||
for library in LOADABLE_CLASSES:
|
||||
LIBRARIES.append(library)
|
||||
|
||||
SUPPORTED_DEVICE_MAP = ["balanced"] + [get_device(), "cpu"]
|
||||
SUPPORTED_DEVICE_MAP = ["balanced"] + [get_device()]
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
@@ -468,7 +468,8 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
||||
pipeline_is_sequentially_offloaded = any(
|
||||
module_is_sequentially_offloaded(module) for _, module in self.components.items()
|
||||
)
|
||||
is_pipeline_device_mapped = self._is_pipeline_device_mapped()
|
||||
|
||||
is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
|
||||
if is_pipeline_device_mapped:
|
||||
raise ValueError(
|
||||
"It seems like you have activated a device mapping strategy on the pipeline which doesn't allow explicit device placement using `to()`. You can call `reset_device_map()` to remove the existing device map from the pipeline."
|
||||
@@ -1187,7 +1188,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
||||
"""
|
||||
self._maybe_raise_error_if_group_offload_active(raise_error=True)
|
||||
|
||||
is_pipeline_device_mapped = self._is_pipeline_device_mapped()
|
||||
is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
|
||||
if is_pipeline_device_mapped:
|
||||
raise ValueError(
|
||||
"It seems like you have activated a device mapping strategy on the pipeline so calling `enable_model_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_model_cpu_offload()`."
|
||||
@@ -1311,7 +1312,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
||||
raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
|
||||
self.remove_all_hooks()
|
||||
|
||||
is_pipeline_device_mapped = self._is_pipeline_device_mapped()
|
||||
is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
|
||||
if is_pipeline_device_mapped:
|
||||
raise ValueError(
|
||||
"It seems like you have activated a device mapping strategy on the pipeline so calling `enable_sequential_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_sequential_cpu_offload()`."
|
||||
@@ -2227,21 +2228,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
||||
return True
|
||||
return False
|
||||
|
||||
def _is_pipeline_device_mapped(self):
|
||||
# We support passing `device_map="cuda"`, for example. This is helpful, in case
|
||||
# users want to pass `device_map="cpu"` when initializing a pipeline. This explicit declaration is desirable
|
||||
# in limited VRAM environments because quantized models often initialize directly on the accelerator.
|
||||
device_map = self.hf_device_map
|
||||
is_device_type_map = False
|
||||
if isinstance(device_map, str):
|
||||
try:
|
||||
torch.device(device_map)
|
||||
is_device_type_map = True
|
||||
except RuntimeError:
|
||||
pass
|
||||
|
||||
return not is_device_type_map and isinstance(device_map, dict) and len(device_map) > 1
|
||||
|
||||
|
||||
class StableDiffusionMixin:
|
||||
r"""
|
||||
|
||||
@@ -81,7 +81,7 @@ class TorchCompileTesterMixin:
|
||||
_ = model(**inputs_dict)
|
||||
|
||||
@torch.no_grad()
|
||||
def test_torch_compile_repeated_blocks(self, recompile_limit=1):
|
||||
def test_torch_compile_repeated_blocks(self):
|
||||
if self.model_class._repeated_blocks is None:
|
||||
pytest.skip("Skipping test as the model class doesn't have `_repeated_blocks` set.")
|
||||
|
||||
@@ -92,6 +92,7 @@ class TorchCompileTesterMixin:
|
||||
model.eval()
|
||||
model.compile_repeated_blocks(fullgraph=True)
|
||||
|
||||
recompile_limit = 1
|
||||
if self.model_class.__name__ == "UNet2DConditionModel":
|
||||
recompile_limit = 2
|
||||
|
||||
|
||||
@@ -628,21 +628,6 @@ class BitsAndBytesTesterMixin(BitsAndBytesConfigMixin, QuantizationTesterMixin):
|
||||
"""Test that quantized models can be used for training with adapters."""
|
||||
self._test_quantization_training(BitsAndBytesConfigMixin.BNB_CONFIGS["4bit_nf4"])
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"config_name",
|
||||
list(BitsAndBytesConfigMixin.BNB_CONFIGS.keys()),
|
||||
ids=list(BitsAndBytesConfigMixin.BNB_CONFIGS.keys()),
|
||||
)
|
||||
def test_cpu_device_map(self, config_name):
|
||||
config_kwargs = BitsAndBytesConfigMixin.BNB_CONFIGS[config_name]
|
||||
model_quantized = self._create_quantized_model(config_kwargs, device_map="cpu")
|
||||
|
||||
assert hasattr(model_quantized, "hf_device_map"), "Model should have hf_device_map attribute"
|
||||
assert model_quantized.hf_device_map is not None, "hf_device_map should not be None"
|
||||
assert model_quantized.device == torch.device("cpu"), (
|
||||
f"Model should be on CPU, but is on {model_quantized.device}"
|
||||
)
|
||||
|
||||
|
||||
@is_quantization
|
||||
@is_quanto
|
||||
|
||||
@@ -147,7 +147,22 @@ class TestWanVACETransformer3DCompile(WanVACETransformer3DTesterConfig, TorchCom
|
||||
def test_torch_compile_repeated_blocks(self):
|
||||
# WanVACE has two block types (WanTransformerBlock and WanVACETransformerBlock),
|
||||
# so we need recompile_limit=2 instead of the default 1.
|
||||
super().test_torch_compile_repeated_blocks(recompile_limit=2)
|
||||
import torch._dynamo
|
||||
import torch._inductor.utils
|
||||
|
||||
init_dict = self.get_init_dict()
|
||||
inputs_dict = self.get_dummy_inputs()
|
||||
|
||||
model = self.model_class(**init_dict).to(torch_device)
|
||||
model.eval()
|
||||
model.compile_repeated_blocks(fullgraph=True)
|
||||
|
||||
with (
|
||||
torch._inductor.utils.fresh_inductor_cache(),
|
||||
torch._dynamo.config.patch(recompile_limit=2),
|
||||
):
|
||||
_ = model(**inputs_dict)
|
||||
_ = model(**inputs_dict)
|
||||
|
||||
|
||||
class TestWanVACETransformer3DBitsAndBytes(WanVACETransformer3DTesterConfig, BitsAndBytesTesterMixin):
|
||||
|
||||
@@ -158,10 +158,6 @@ class AllegroPipelineFastTests(PipelineTesterMixin, PyramidAttentionBroadcastTes
|
||||
def test_save_load_optional_components(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("Decoding without tiling is not yet implemented")
|
||||
def test_pipeline_with_accelerator_device_map(self):
|
||||
pass
|
||||
|
||||
def test_inference(self):
|
||||
device = "cpu"
|
||||
|
||||
|
||||
@@ -34,7 +34,9 @@ enable_full_determinism()
|
||||
|
||||
class KandinskyPipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
pipeline_class = KandinskyCombinedPipeline
|
||||
params = ["prompt"]
|
||||
params = [
|
||||
"prompt",
|
||||
]
|
||||
batch_params = ["prompt", "negative_prompt"]
|
||||
required_optional_params = [
|
||||
"generator",
|
||||
@@ -146,10 +148,6 @@ class KandinskyPipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase)
|
||||
def test_dict_tuple_outputs_equivalent(self):
|
||||
super().test_dict_tuple_outputs_equivalent(expected_max_difference=5e-4)
|
||||
|
||||
@unittest.skip("Test not supported.")
|
||||
def test_pipeline_with_accelerator_device_map(self):
|
||||
pass
|
||||
|
||||
|
||||
class KandinskyPipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
pipeline_class = KandinskyImg2ImgCombinedPipeline
|
||||
@@ -266,10 +264,6 @@ class KandinskyPipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.Te
|
||||
def test_save_load_optional_components(self):
|
||||
super().test_save_load_optional_components(expected_max_difference=5e-4)
|
||||
|
||||
@unittest.skip("Test not supported.")
|
||||
def test_pipeline_with_accelerator_device_map(self):
|
||||
pass
|
||||
|
||||
|
||||
class KandinskyPipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
pipeline_class = KandinskyInpaintCombinedPipeline
|
||||
@@ -390,7 +384,3 @@ class KandinskyPipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.Te
|
||||
|
||||
def test_save_load_local(self):
|
||||
super().test_save_load_local(expected_max_difference=5e-3)
|
||||
|
||||
@unittest.skip("Test not supported.")
|
||||
def test_pipeline_with_accelerator_device_map(self):
|
||||
pass
|
||||
|
||||
@@ -36,7 +36,9 @@ enable_full_determinism()
|
||||
|
||||
class KandinskyV22PipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
pipeline_class = KandinskyV22CombinedPipeline
|
||||
params = ["prompt"]
|
||||
params = [
|
||||
"prompt",
|
||||
]
|
||||
batch_params = ["prompt", "negative_prompt"]
|
||||
required_optional_params = [
|
||||
"generator",
|
||||
@@ -68,7 +70,12 @@ class KandinskyV22PipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCa
|
||||
def get_dummy_inputs(self, device, seed=0):
|
||||
prior_dummy = PriorDummies()
|
||||
inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
|
||||
inputs.update({"height": 64, "width": 64})
|
||||
inputs.update(
|
||||
{
|
||||
"height": 64,
|
||||
"width": 64,
|
||||
}
|
||||
)
|
||||
return inputs
|
||||
|
||||
def test_kandinsky(self):
|
||||
@@ -148,18 +155,12 @@ class KandinskyV22PipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCa
|
||||
def test_save_load_optional_components(self):
|
||||
super().test_save_load_optional_components(expected_max_difference=5e-3)
|
||||
|
||||
@unittest.skip("Test not supported.")
|
||||
def test_callback_inputs(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("Test not supported.")
|
||||
def test_callback_cfg(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("Test not supported.")
|
||||
def test_pipeline_with_accelerator_device_map(self):
|
||||
pass
|
||||
|
||||
|
||||
class KandinskyV22PipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
pipeline_class = KandinskyV22Img2ImgCombinedPipeline
|
||||
@@ -278,18 +279,12 @@ class KandinskyV22PipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest
|
||||
def save_load_local(self):
|
||||
super().test_save_load_local(expected_max_difference=5e-3)
|
||||
|
||||
@unittest.skip("Test not supported.")
|
||||
def test_callback_inputs(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("Test not supported.")
|
||||
def test_callback_cfg(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("Test not supported.")
|
||||
def test_pipeline_with_accelerator_device_map(self):
|
||||
pass
|
||||
|
||||
|
||||
class KandinskyV22PipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
pipeline_class = KandinskyV22InpaintCombinedPipeline
|
||||
@@ -416,7 +411,3 @@ class KandinskyV22PipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest
|
||||
|
||||
def test_callback_cfg(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("`device_map` is not yet supported for connected pipelines.")
|
||||
def test_pipeline_with_accelerator_device_map(self):
|
||||
pass
|
||||
|
||||
@@ -296,9 +296,6 @@ class KandinskyV22InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCas
|
||||
output = pipe(**inputs)[0]
|
||||
assert output.abs().sum() == 0
|
||||
|
||||
def test_pipeline_with_accelerator_device_map(self):
|
||||
super().test_pipeline_with_accelerator_device_map(expected_max_difference=5e-3)
|
||||
|
||||
|
||||
@slow
|
||||
@require_torch_accelerator
|
||||
|
||||
@@ -194,9 +194,6 @@ class Kandinsky3Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase)
|
||||
def test_save_load_dduf(self):
|
||||
super().test_save_load_dduf(atol=1e-3, rtol=1e-3)
|
||||
|
||||
def test_pipeline_with_accelerator_device_map(self):
|
||||
super().test_pipeline_with_accelerator_device_map(expected_max_difference=5e-3)
|
||||
|
||||
|
||||
@slow
|
||||
@require_torch_accelerator
|
||||
|
||||
@@ -2355,6 +2355,7 @@ class PipelineTesterMixin:
|
||||
f"Component '{name}' has dtype {component.dtype} but expected {expected_dtype}",
|
||||
)
|
||||
|
||||
@require_torch_accelerator
|
||||
def test_pipeline_with_accelerator_device_map(self, expected_max_difference=1e-4):
|
||||
components = self.get_dummy_components()
|
||||
pipe = self.pipeline_class(**components)
|
||||
|
||||
@@ -342,7 +342,3 @@ class VisualClozePipelineFastTests(unittest.TestCase, PipelineTesterMixin):
|
||||
self.assertLess(
|
||||
max_diff, expected_max_diff, "The output of the fp16 pipeline changed after saving and loading."
|
||||
)
|
||||
|
||||
@unittest.skip("Test not supported.")
|
||||
def test_pipeline_with_accelerator_device_map(self):
|
||||
pass
|
||||
|
||||
@@ -310,7 +310,3 @@ class VisualClozeGenerationPipelineFastTests(unittest.TestCase, PipelineTesterMi
|
||||
@unittest.skip("Skipped due to missing layout_prompt. Needs further investigation.")
|
||||
def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=None, atol=0.0001, rtol=0.0001):
|
||||
pass
|
||||
|
||||
@unittest.skip("Needs to be revisited later.")
|
||||
def test_pipeline_with_accelerator_device_map(self, expected_max_difference=0.0001):
|
||||
pass
|
||||
|
||||
Reference in New Issue
Block a user