fix copies

remove k-diffusion
2026-02-18 00:38:40 +08:00 · 2026-02-16 13:10:19 +05:30 · 2026-02-16 12:50:25 +05:30
17 changed files with 43 additions and 125 deletions
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -114,7 +114,7 @@ jobs:

    - name: Install dependencies
      run: |
-        uv pip install -e ".[quality,test]"
+        uv pip install -e ".[quality]"
        #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
        uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
        uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
@@ -191,7 +191,7 @@ jobs:

    - name: Install dependencies
      run: |
-        uv pip install -e ".[quality,test]"
+        uv pip install -e ".[quality]"

    - name: Environment
      run: |
@@ -242,7 +242,7 @@ jobs:

    - name: Install dependencies
      run: |
-        uv pip install -e ".[quality,test]"
+        uv pip install -e ".[quality]"
        # TODO (sayakpaul, DN6): revisit `--no-deps`
        uv pip install -U peft@git+https://github.com/huggingface/peft.git --no-deps
        uv pip install -U tokenizers
--- a/setup.py
+++ b/setup.py
@@ -101,7 +101,6 @@ _deps = [
    "datasets",
    "filelock",
    "flax>=0.4.1",
-    "ftfy",
    "hf-doc-builder>=0.3.0",
    "httpx<1.0.0",
    "huggingface-hub>=0.34.0,<2.0",
@@ -222,14 +221,12 @@ extras["docs"] = deps_list("hf-doc-builder")
 extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2", "peft", "timm")
 extras["test"] = deps_list(
    "compel",
-    "ftfy",
    "GitPython",
    "datasets",
    "Jinja2",
    "invisible-watermark",
    "librosa",
    "parameterized",
-    "protobuf",
    "pytest",
    "pytest-timeout",
    "pytest-xdist",
@@ -238,7 +235,6 @@ extras["test"] = deps_list(
    "sentencepiece",
    "scipy",
    "tiktoken",
-    "torchsde",
    "torchvision",
    "transformers",
    "phonemizer",
--- a/src/diffusers/dependency_versions_table.py
+++ b/src/diffusers/dependency_versions_table.py
@@ -8,7 +8,6 @@ deps = {
    "datasets": "datasets",
    "filelock": "filelock",
    "flax": "flax>=0.4.1",
-    "ftfy": "ftfy",
    "hf-doc-builder": "hf-doc-builder>=0.3.0",
    "httpx": "httpx<1.0.0",
    "huggingface-hub": "huggingface-hub>=0.34.0,<2.0",
--- a/src/diffusers/models/attention_dispatch.py
+++ b/src/diffusers/models/attention_dispatch.py
@@ -1117,26 +1117,6 @@ def _sage_attention_backward_op(
    raise NotImplementedError("Backward pass is not implemented for Sage attention.")


-def _maybe_modify_attn_mask_npu(query: torch.Tensor, key: torch.Tensor, attn_mask: torch.Tensor | None = None):
-    # Skip Attention Mask if all values are 1, `None` mask can speedup the computation
-    if attn_mask is not None and torch.all(attn_mask != 0):
-        attn_mask = None
-
-    # Reshape Attention Mask: [batch_size, seq_len_k] -> [batch_size, 1, sqe_len_q, seq_len_k]
-    # https://www.hiascend.com/document/detail/zh/Pytorch/730/apiref/torchnpuCustomsapi/docs/context/torch_npu-npu_fusion_attention.md
-    if (
-        attn_mask is not None
-        and attn_mask.ndim == 2
-        and attn_mask.shape[0] == query.shape[0]
-        and attn_mask.shape[1] == key.shape[1]
-    ):
-        B, Sq, Skv = attn_mask.shape[0], query.shape[1], key.shape[1]
-        attn_mask = ~attn_mask.to(torch.bool)
-        attn_mask = attn_mask.unsqueeze(1).expand(B, Sq, Skv).unsqueeze(1).contiguous()
-
-    return attn_mask
-
-
 def _npu_attention_forward_op(
    ctx: torch.autograd.function.FunctionCtx,
    query: torch.Tensor,
@@ -1154,14 +1134,11 @@ def _npu_attention_forward_op(
    if return_lse:
        raise ValueError("NPU attention backend does not support setting `return_lse=True`.")

-    attn_mask = _maybe_modify_attn_mask_npu(query, key, attn_mask)
-
    out = npu_fusion_attention(
        query,
        key,
        value,
        query.size(2),  # num_heads
-        atten_mask=attn_mask,
        input_layout="BSND",
        pse=None,
        scale=1.0 / math.sqrt(query.shape[-1]) if scale is None else scale,
@@ -2691,17 +2668,16 @@ def _native_npu_attention(
    return_lse: bool = False,
    _parallel_config: "ParallelConfig" | None = None,
 ) -> torch.Tensor:
+    if attn_mask is not None:
+        raise ValueError("`attn_mask` is not supported for NPU attention")
    if return_lse:
        raise ValueError("NPU attention backend does not support setting `return_lse=True`.")
    if _parallel_config is None:
-        attn_mask = _maybe_modify_attn_mask_npu(query, key, attn_mask)
-
        out = npu_fusion_attention(
            query,
            key,
            value,
            query.size(2),  # num_heads
-            atten_mask=attn_mask,
            input_layout="BSND",
            pse=None,
            scale=1.0 / math.sqrt(query.shape[-1]) if scale is None else scale,
@@ -2716,7 +2692,7 @@ def _native_npu_attention(
            query,
            key,
            value,
-            attn_mask,
+            None,
            dropout_p,
            None,
            scale,
--- a/src/diffusers/models/transformers/transformer_qwenimage.py
+++ b/src/diffusers/models/transformers/transformer_qwenimage.py
@@ -164,11 +164,7 @@ def compute_text_seq_len_from_mask(
    position_ids = torch.arange(text_seq_len, device=encoder_hidden_states.device, dtype=torch.long)
    active_positions = torch.where(encoder_hidden_states_mask, position_ids, position_ids.new_zeros(()))
    has_active = encoder_hidden_states_mask.any(dim=1)
-    per_sample_len = torch.where(
-        has_active,
-        active_positions.max(dim=1).values + 1,
-        torch.as_tensor(text_seq_len, device=encoder_hidden_states.device),
-    )
+    per_sample_len = torch.where(has_active, active_positions.max(dim=1).values + 1, torch.as_tensor(text_seq_len))
    return text_seq_len, per_sample_len, encoder_hidden_states_mask


--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -112,7 +112,7 @@ LIBRARIES = []
 for library in LOADABLE_CLASSES:
    LIBRARIES.append(library)

-SUPPORTED_DEVICE_MAP = ["balanced"] + [get_device(), "cpu"]
+SUPPORTED_DEVICE_MAP = ["balanced"] + [get_device()]

 logger = logging.get_logger(__name__)

@@ -468,7 +468,8 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
        pipeline_is_sequentially_offloaded = any(
            module_is_sequentially_offloaded(module) for _, module in self.components.items()
        )
-        is_pipeline_device_mapped = self._is_pipeline_device_mapped()
+
+        is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
        if is_pipeline_device_mapped:
            raise ValueError(
                "It seems like you have activated a device mapping strategy on the pipeline which doesn't allow explicit device placement using `to()`. You can call `reset_device_map()` to remove the existing device map from the pipeline."
@@ -1187,7 +1188,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
        """
        self._maybe_raise_error_if_group_offload_active(raise_error=True)

-        is_pipeline_device_mapped = self._is_pipeline_device_mapped()
+        is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
        if is_pipeline_device_mapped:
            raise ValueError(
                "It seems like you have activated a device mapping strategy on the pipeline so calling `enable_model_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_model_cpu_offload()`."
@@ -1311,7 +1312,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
            raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
        self.remove_all_hooks()

-        is_pipeline_device_mapped = self._is_pipeline_device_mapped()
+        is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
        if is_pipeline_device_mapped:
            raise ValueError(
                "It seems like you have activated a device mapping strategy on the pipeline so calling `enable_sequential_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_sequential_cpu_offload()`."
@@ -2227,21 +2228,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
                return True
        return False

-    def _is_pipeline_device_mapped(self):
-        # We support passing `device_map="cuda"`, for example. This is helpful, in case
-        # users want to pass `device_map="cpu"` when initializing a pipeline. This explicit declaration is desirable
-        # in limited VRAM environments because quantized models often initialize directly on the accelerator.
-        device_map = self.hf_device_map
-        is_device_type_map = False
-        if isinstance(device_map, str):
-            try:
-                torch.device(device_map)
-                is_device_type_map = True
-            except RuntimeError:
-                pass
-
-        return not is_device_type_map and isinstance(device_map, dict) and len(device_map) > 1
-

 class StableDiffusionMixin:
    r"""
--- a/tests/models/testing_utils/compile.py
+++ b/tests/models/testing_utils/compile.py
@@ -81,7 +81,7 @@ class TorchCompileTesterMixin:
            _ = model(**inputs_dict)

    @torch.no_grad()
-    def test_torch_compile_repeated_blocks(self, recompile_limit=1):
+    def test_torch_compile_repeated_blocks(self):
        if self.model_class._repeated_blocks is None:
            pytest.skip("Skipping test as the model class doesn't have `_repeated_blocks` set.")

@@ -92,6 +92,7 @@ class TorchCompileTesterMixin:
        model.eval()
        model.compile_repeated_blocks(fullgraph=True)

+        recompile_limit = 1
        if self.model_class.__name__ == "UNet2DConditionModel":
            recompile_limit = 2

--- a/tests/models/testing_utils/quantization.py
+++ b/tests/models/testing_utils/quantization.py
@@ -628,21 +628,6 @@ class BitsAndBytesTesterMixin(BitsAndBytesConfigMixin, QuantizationTesterMixin):
        """Test that quantized models can be used for training with adapters."""
        self._test_quantization_training(BitsAndBytesConfigMixin.BNB_CONFIGS["4bit_nf4"])

-    @pytest.mark.parametrize(
-        "config_name",
-        list(BitsAndBytesConfigMixin.BNB_CONFIGS.keys()),
-        ids=list(BitsAndBytesConfigMixin.BNB_CONFIGS.keys()),
-    )
-    def test_cpu_device_map(self, config_name):
-        config_kwargs = BitsAndBytesConfigMixin.BNB_CONFIGS[config_name]
-        model_quantized = self._create_quantized_model(config_kwargs, device_map="cpu")
-
-        assert hasattr(model_quantized, "hf_device_map"), "Model should have hf_device_map attribute"
-        assert model_quantized.hf_device_map is not None, "hf_device_map should not be None"
-        assert model_quantized.device == torch.device("cpu"), (
-            f"Model should be on CPU, but is on {model_quantized.device}"
-        )
-

@is_quantization
@is_quanto
--- a/tests/models/transformers/test_models_transformer_wan_vace.py
+++ b/tests/models/transformers/test_models_transformer_wan_vace.py
@@ -147,7 +147,22 @@ class TestWanVACETransformer3DCompile(WanVACETransformer3DTesterConfig, TorchCom
    def test_torch_compile_repeated_blocks(self):
        # WanVACE has two block types (WanTransformerBlock and WanVACETransformerBlock),
        # so we need recompile_limit=2 instead of the default 1.
-        super().test_torch_compile_repeated_blocks(recompile_limit=2)
+        import torch._dynamo
+        import torch._inductor.utils
+
+        init_dict = self.get_init_dict()
+        inputs_dict = self.get_dummy_inputs()
+
+        model = self.model_class(**init_dict).to(torch_device)
+        model.eval()
+        model.compile_repeated_blocks(fullgraph=True)
+
+        with (
+            torch._inductor.utils.fresh_inductor_cache(),
+            torch._dynamo.config.patch(recompile_limit=2),
+        ):
+            _ = model(**inputs_dict)
+            _ = model(**inputs_dict)


 class TestWanVACETransformer3DBitsAndBytes(WanVACETransformer3DTesterConfig, BitsAndBytesTesterMixin):
--- a/tests/pipelines/allegro/test_allegro.py
+++ b/tests/pipelines/allegro/test_allegro.py
@@ -158,10 +158,6 @@ class AllegroPipelineFastTests(PipelineTesterMixin, PyramidAttentionBroadcastTes
    def test_save_load_optional_components(self):
        pass

-    @unittest.skip("Decoding without tiling is not yet implemented")
-    def test_pipeline_with_accelerator_device_map(self):
-        pass
-
    def test_inference(self):
        device = "cpu"

--- a/tests/pipelines/kandinsky/test_kandinsky_combined.py
+++ b/tests/pipelines/kandinsky/test_kandinsky_combined.py
@@ -34,7 +34,9 @@ enable_full_determinism()

 class KandinskyPipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = KandinskyCombinedPipeline
-    params = ["prompt"]
+    params = [
+        "prompt",
+    ]
    batch_params = ["prompt", "negative_prompt"]
    required_optional_params = [
        "generator",
@@ -146,10 +148,6 @@ class KandinskyPipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase)
    def test_dict_tuple_outputs_equivalent(self):
        super().test_dict_tuple_outputs_equivalent(expected_max_difference=5e-4)

-    @unittest.skip("Test not supported.")
-    def test_pipeline_with_accelerator_device_map(self):
-        pass
-

 class KandinskyPipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = KandinskyImg2ImgCombinedPipeline
@@ -266,10 +264,6 @@ class KandinskyPipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.Te
    def test_save_load_optional_components(self):
        super().test_save_load_optional_components(expected_max_difference=5e-4)

-    @unittest.skip("Test not supported.")
-    def test_pipeline_with_accelerator_device_map(self):
-        pass
-

 class KandinskyPipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = KandinskyInpaintCombinedPipeline
@@ -390,7 +384,3 @@ class KandinskyPipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.Te

    def test_save_load_local(self):
        super().test_save_load_local(expected_max_difference=5e-3)
-
-    @unittest.skip("Test not supported.")
-    def test_pipeline_with_accelerator_device_map(self):
-        pass
--- a/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py
+++ b/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py
@@ -36,7 +36,9 @@ enable_full_determinism()

 class KandinskyV22PipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = KandinskyV22CombinedPipeline
-    params = ["prompt"]
+    params = [
+        "prompt",
+    ]
    batch_params = ["prompt", "negative_prompt"]
    required_optional_params = [
        "generator",
@@ -68,7 +70,12 @@ class KandinskyV22PipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCa
    def get_dummy_inputs(self, device, seed=0):
        prior_dummy = PriorDummies()
        inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
-        inputs.update({"height": 64, "width": 64})
+        inputs.update(
+            {
+                "height": 64,
+                "width": 64,
+            }
+        )
        return inputs

    def test_kandinsky(self):
@@ -148,18 +155,12 @@ class KandinskyV22PipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCa
    def test_save_load_optional_components(self):
        super().test_save_load_optional_components(expected_max_difference=5e-3)

-    @unittest.skip("Test not supported.")
    def test_callback_inputs(self):
        pass

-    @unittest.skip("Test not supported.")
    def test_callback_cfg(self):
        pass

-    @unittest.skip("Test not supported.")
-    def test_pipeline_with_accelerator_device_map(self):
-        pass
-

 class KandinskyV22PipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = KandinskyV22Img2ImgCombinedPipeline
@@ -278,18 +279,12 @@ class KandinskyV22PipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest
    def save_load_local(self):
        super().test_save_load_local(expected_max_difference=5e-3)

-    @unittest.skip("Test not supported.")
    def test_callback_inputs(self):
        pass

-    @unittest.skip("Test not supported.")
    def test_callback_cfg(self):
        pass

-    @unittest.skip("Test not supported.")
-    def test_pipeline_with_accelerator_device_map(self):
-        pass
-

 class KandinskyV22PipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = KandinskyV22InpaintCombinedPipeline
@@ -416,7 +411,3 @@ class KandinskyV22PipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest

    def test_callback_cfg(self):
        pass
-
-    @unittest.skip("`device_map` is not yet supported for connected pipelines.")
-    def test_pipeline_with_accelerator_device_map(self):
-        pass
--- a/tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py
+++ b/tests/pipelines/kandinsky2_2/test_kandinsky_inpaint.py
@@ -296,9 +296,6 @@ class KandinskyV22InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCas
        output = pipe(**inputs)[0]
        assert output.abs().sum() == 0

-    def test_pipeline_with_accelerator_device_map(self):
-        super().test_pipeline_with_accelerator_device_map(expected_max_difference=5e-3)
-

@slow
@require_torch_accelerator
--- a/tests/pipelines/kandinsky3/test_kandinsky3_img2img.py
+++ b/tests/pipelines/kandinsky3/test_kandinsky3_img2img.py
@@ -194,9 +194,6 @@ class Kandinsky3Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase)
    def test_save_load_dduf(self):
        super().test_save_load_dduf(atol=1e-3, rtol=1e-3)

-    def test_pipeline_with_accelerator_device_map(self):
-        super().test_pipeline_with_accelerator_device_map(expected_max_difference=5e-3)
-

@slow
@require_torch_accelerator
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -2355,6 +2355,7 @@ class PipelineTesterMixin:
                    f"Component '{name}' has dtype {component.dtype} but expected {expected_dtype}",
                )

+    @require_torch_accelerator
    def test_pipeline_with_accelerator_device_map(self, expected_max_difference=1e-4):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
--- a/tests/pipelines/visualcloze/test_pipeline_visualcloze_combined.py
+++ b/tests/pipelines/visualcloze/test_pipeline_visualcloze_combined.py
@@ -342,7 +342,3 @@ class VisualClozePipelineFastTests(unittest.TestCase, PipelineTesterMixin):
        self.assertLess(
            max_diff, expected_max_diff, "The output of the fp16 pipeline changed after saving and loading."
        )
-
-    @unittest.skip("Test not supported.")
-    def test_pipeline_with_accelerator_device_map(self):
-        pass
--- a/tests/pipelines/visualcloze/test_pipeline_visualcloze_generation.py
+++ b/tests/pipelines/visualcloze/test_pipeline_visualcloze_generation.py
@@ -310,7 +310,3 @@ class VisualClozeGenerationPipelineFastTests(unittest.TestCase, PipelineTesterMi
    @unittest.skip("Skipped due to missing layout_prompt. Needs further investigation.")
    def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=None, atol=0.0001, rtol=0.0001):
        pass
-
-    @unittest.skip("Needs to be revisited later.")
-    def test_pipeline_with_accelerator_device_map(self, expected_max_difference=0.0001):
-        pass
Author	SHA1	Message	Date
DN6	36c0d78b8b	fix copies	2026-02-16 13:10:19 +05:30
DN6	66f6f8b926	remove k-diffusion	2026-02-16 12:50:25 +05:30