mirror of https://github.com/huggingface/diffusers.git
synced 2025-12-27 23:08:57 +08:00

Compare commits: safetensor...build-dock (3 commits)

Commits:
- 7056cd943e
- 826f43505d
- 4af76d0d7d
.github/workflows/benchmark.yml (vendored, 2 changed lines)
@@ -23,7 +23,7 @@ jobs:
     runs-on:
       group: aws-g6-4xlarge-plus
     container:
-      image: diffusers/diffusers-pytorch-compile-cuda
+      image: diffusers/diffusers-pytorch-cuda
       options: --shm-size "16gb" --ipc host --gpus 0
     steps:
       - name: Checkout diffusers
.github/workflows/build_docker_images.yml (vendored, 11 changed lines)
@@ -38,9 +38,16 @@ jobs:
           token: ${{ secrets.GITHUB_TOKEN }}

       - name: Build Changed Docker Images
+        env:
+          CHANGED_FILES: "${{ steps.file_changes.outputs.all }}"
         run: |
-          CHANGED_FILES="${{ steps.file_changes.outputs.all }}"
           for FILE in $CHANGED_FILES; do
+            # skip anything that isn't still on disk
+            if [[ ! -f "$FILE" ]]; then
+              echo "Skipping removed file $FILE"
+              continue
+            fi
+
             if [[ "$FILE" == docker/*Dockerfile ]]; then
               DOCKER_PATH="${FILE%/Dockerfile}"
               DOCKER_TAG=$(basename "$DOCKER_PATH")
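Note: the step above derives the image tag from the Dockerfile's parent directory, and now skips paths that were deleted in the same push. A minimal Python mirror of that string logic, with an illustrative path (the workflow itself does this in bash):

import os

file = "docker/diffusers-pytorch-cuda/Dockerfile"  # illustrative changed file
if file.startswith("docker/") and file.endswith("/Dockerfile"):
    docker_path = file[: -len("/Dockerfile")]   # docker/diffusers-pytorch-cuda
    docker_tag = os.path.basename(docker_path)  # diffusers-pytorch-cuda
    print(docker_tag)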
@@ -65,7 +72,7 @@ jobs:
         image-name:
           - diffusers-pytorch-cpu
           - diffusers-pytorch-cuda
-          - diffusers-pytorch-compile-cuda
+          - diffusers-pytorch-cuda
           - diffusers-pytorch-xformers-cuda
           - diffusers-pytorch-minimum-cuda
           - diffusers-flax-cpu
.github/workflows/nightly_tests.yml (vendored, 2 changed lines)
@@ -188,7 +188,7 @@ jobs:
       group: aws-g4dn-2xlarge

     container:
-      image: diffusers/diffusers-pytorch-compile-cuda
+      image: diffusers/diffusers-pytorch-cuda
       options: --gpus 0 --shm-size "16gb" --ipc host

     steps:
.github/workflows/push_tests.yml (vendored, 2 changed lines)
@@ -262,7 +262,7 @@ jobs:
       group: aws-g4dn-2xlarge

     container:
-      image: diffusers/diffusers-pytorch-compile-cuda
+      image: diffusers/diffusers-pytorch-cuda
       options: --gpus 0 --shm-size "16gb" --ipc host

     steps:
.github/workflows/release_tests_fast.yml (vendored, 2 changed lines)
@@ -316,7 +316,7 @@ jobs:
       group: aws-g4dn-2xlarge

     container:
-      image: diffusers/diffusers-pytorch-compile-cuda
+      image: diffusers/diffusers-pytorch-cuda
       options: --gpus 0 --shm-size "16gb" --ipc host

     steps:
docker/diffusers-pytorch-compile-cuda/Dockerfile (deleted)

@@ -1,50 +0,0 @@
-FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
-LABEL maintainer="Hugging Face"
-LABEL repository="diffusers"
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN apt-get -y update \
-    && apt-get install -y software-properties-common \
-    && add-apt-repository ppa:deadsnakes/ppa
-
-RUN apt install -y bash \
-    build-essential \
-    git \
-    git-lfs \
-    curl \
-    ca-certificates \
-    libsndfile1-dev \
-    libgl1 \
-    python3.10 \
-    python3.10-dev \
-    python3-pip \
-    python3.10-venv && \
-    rm -rf /var/lib/apt/lists
-
-# make sure to use venv
-RUN python3.10 -m venv /opt/venv
-ENV PATH="/opt/venv/bin:$PATH"
-
-# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
-RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
-    python3.10 -m uv pip install --no-cache-dir \
-        torch \
-        torchvision \
-        torchaudio \
-        invisible_watermark && \
-    python3.10 -m pip install --no-cache-dir \
-        accelerate \
-        datasets \
-        hf-doc-builder \
-        huggingface-hub \
-        hf_transfer \
-        Jinja2 \
-        librosa \
-        numpy==1.26.4 \
-        scipy \
-        tensorboard \
-        transformers \
-        hf_transfer
-
-CMD ["/bin/bash"]
src/diffusers/pipelines/pipeline_loading_utils.py

@@ -146,21 +146,27 @@ def is_safetensors_compatible(filenames, passed_components=None, folder_names=No
             components[component].append(component_filename)

     # If there are no component folders check the main directory for safetensors files
+    filtered_filenames = set()
     if not components:
         if variant is not None:
             filtered_filenames = filter_with_regex(filenames, variant_file_re)
-        else:
+
+        # If no variant filenames exist check if non-variant files are available
+        if not filtered_filenames:
             filtered_filenames = filter_with_regex(filenames, non_variant_file_re)
         return any(".safetensors" in filename for filename in filtered_filenames)

     # iterate over all files of a component
     # check if safetensor files exist for that component
     # if variant is provided check if the variant of the safetensors exists
     for component, component_filenames in components.items():
         matches = []
+        filtered_component_filenames = set()
+        # if variant is provided check if the variant of the safetensors exists
         if variant is not None:
             filtered_component_filenames = filter_with_regex(component_filenames, variant_file_re)
-        else:
+
+        # if variant safetensor files do not exist check for non-variants
+        if not filtered_component_filenames:
             filtered_component_filenames = filter_with_regex(component_filenames, non_variant_file_re)
         for component_filename in filtered_component_filenames:
             filename, extension = os.path.splitext(component_filename)
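Note: the rewritten branches replace a strict either/or with prefer-variant-then-fall-back, per component. A standalone sketch of that behavior (simplified regexes and helper names, not the library's actual ones):

import re

def variant_weights_with_fallback(components, variant):
    # Per component: prefer the requested variant's safetensors file and fall
    # back to the non-variant file when the variant is missing - the behavior
    # the hunk above introduces.
    variant_re = re.compile(rf".*\.{re.escape(variant)}\.safetensors$")
    non_variant_re = re.compile(r".*/[^./]+\.safetensors$")

    picked = {}
    for name, files in components.items():
        chosen = {f for f in files if variant_re.match(f)}
        if not chosen:  # no variant file for this component: fall back
            chosen = {f for f in files if non_variant_re.match(f)}
        picked[name] = chosen
    return picked

components = {
    "unet": ["unet/diffusion_pytorch_model.fp16.safetensors"],
    "vae": ["vae/diffusion_pytorch_model.safetensors"],
}
# unet resolves to its fp16 variant, vae falls back to the plain file
print(variant_weights_with_fallback(components, variant="fp16"))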
tests/models/test_modeling_common.py

@@ -1748,14 +1748,14 @@ class TorchCompileTesterMixin:
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
-        torch._dynamo.reset()
+        torch.compiler.reset()
         gc.collect()
         backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test in case of CUDA runtime errors
         super().tearDown()
-        torch._dynamo.reset()
+        torch.compiler.reset()
         gc.collect()
         backend_empty_cache(torch_device)
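Note: torch.compiler.reset() is the public API (available since roughly PyTorch 2.1) that clears torch.compile state, replacing the private torch._dynamo.reset() used before. The teardown pattern, roughly:

import gc
import torch

def reset_compile_state():
    # Public entry point that clears compilation caches and guards.
    torch.compiler.reset()
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # stand-in for backend_empty_cache(torch_device)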
@@ -1764,13 +1764,17 @@ class TorchCompileTesterMixin:
     @is_torch_compile
     @slow
     def test_torch_compile_recompilation_and_graph_break(self):
-        torch._dynamo.reset()
+        torch.compiler.reset()
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()

         model = self.model_class(**init_dict).to(torch_device)
         model = torch.compile(model, fullgraph=True)

-        with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
+        with (
+            torch._inductor.utils.fresh_inductor_cache(),
+            torch._dynamo.config.patch(error_on_recompile=True),
+            torch.no_grad(),
+        ):
             _ = model(**inputs_dict)
             _ = model(**inputs_dict)
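Note: the parenthesized context stack adds torch._inductor.utils.fresh_inductor_cache(), which compiles into a temporary cache directory so tests don't reuse Inductor artifacts from earlier runs in the same process. A toy version of the pattern (private torch APIs, subject to change):

import torch

def double(x):
    return x * 2

compiled = torch.compile(double, fullgraph=True)

with (
    torch._inductor.utils.fresh_inductor_cache(),          # per-test scratch cache dir
    torch._dynamo.config.patch(error_on_recompile=True),   # any recompile raises
    torch.no_grad(),
):
    _ = compiled(torch.randn(4))  # first call compiles
    _ = compiled(torch.randn(4))  # same shape: must hit the cached graph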
@@ -1798,7 +1802,7 @@ class LoraHotSwappingForModelTesterMixin:
         # It is critical that the dynamo cache is reset for each test. Otherwise, if the test re-uses the same model,
         # there will be recompilation errors, as torch caches the model when run in the same process.
         super().tearDown()
-        torch._dynamo.reset()
+        torch.compiler.reset()
         gc.collect()
         backend_empty_cache(torch_device)
@@ -1915,7 +1919,7 @@ class LoraHotSwappingForModelTesterMixin:
     def test_hotswapping_compiled_model_linear(self, rank0, rank1):
         # It's important to add this context to raise an error on recompilation
         target_modules = ["to_q", "to_k", "to_v", "to_out.0"]
-        with torch._dynamo.config.patch(error_on_recompile=True):
+        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
             self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)

     @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
@@ -1925,7 +1929,7 @@ class LoraHotSwappingForModelTesterMixin:

         # It's important to add this context to raise an error on recompilation
         target_modules = ["conv", "conv1", "conv2"]
-        with torch._dynamo.config.patch(error_on_recompile=True):
+        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
             self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)

     @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
@@ -1935,7 +1939,7 @@ class LoraHotSwappingForModelTesterMixin:

         # It's important to add this context to raise an error on recompilation
         target_modules = ["to_q", "conv"]
-        with torch._dynamo.config.patch(error_on_recompile=True):
+        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
             self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)

     @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
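Note: in these hot-swap tests, error_on_recompile turns any recompilation into a hard failure, so swapping LoRA weights in place must not invalidate the compiled graph's guards. A toy illustration of what the flag enforces (assumes default dynamo settings):

import torch

compiled = torch.compile(lambda x: x + 1)
_ = compiled(torch.randn(4))  # compiles for this input shape

with torch._dynamo.config.patch(error_on_recompile=True):
    _ = compiled(torch.randn(4))    # same shape: cached graph, no recompile
    # _ = compiled(torch.randn(5))  # a new shape would recompile and raise here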
tests/models/transformers/test_models_transformer_hunyuan_video.py

@@ -19,20 +19,16 @@ import torch

 from diffusers import HunyuanVideoTransformer3DModel
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
-    is_torch_compile,
-    require_torch_2,
-    require_torch_gpu,
-    slow,
     torch_device,
 )

-from ..test_modeling_common import ModelTesterMixin
+from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin


 enable_full_determinism()


-class HunyuanVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
+class HunyuanVideoTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
     model_class = HunyuanVideoTransformer3DModel
     main_input_name = "hidden_states"
     uses_custom_attn_processor = True
@@ -96,23 +92,8 @@ class HunyuanVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
         expected_set = {"HunyuanVideoTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)

-    @require_torch_gpu
-    @require_torch_2
-    @is_torch_compile
-    @slow
-    def test_torch_compile_recompilation_and_graph_break(self):
-        torch._dynamo.reset()
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-        model = self.model_class(**init_dict).to(torch_device)
-        model = torch.compile(model, fullgraph=True)
-
-        with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
-            _ = model(**inputs_dict)
-            _ = model(**inputs_dict)
-

-class HunyuanSkyreelsImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
+class HunyuanSkyreelsImageToVideoTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
     model_class = HunyuanVideoTransformer3DModel
     main_input_name = "hidden_states"
     uses_custom_attn_processor = True
@@ -179,23 +160,8 @@ class HunyuanSkyreelsImageToVideoTransformer3DTests(ModelTesterMixin, unittest.T
         expected_set = {"HunyuanVideoTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)

-    @require_torch_gpu
-    @require_torch_2
-    @is_torch_compile
-    @slow
-    def test_torch_compile_recompilation_and_graph_break(self):
-        torch._dynamo.reset()
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-        model = self.model_class(**init_dict).to(torch_device)
-        model = torch.compile(model, fullgraph=True)
-
-        with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
-            _ = model(**inputs_dict)
-            _ = model(**inputs_dict)
-

-class HunyuanVideoImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
+class HunyuanVideoImageToVideoTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
     model_class = HunyuanVideoTransformer3DModel
     main_input_name = "hidden_states"
     uses_custom_attn_processor = True
@@ -260,23 +226,10 @@ class HunyuanVideoImageToVideoTransformer3DTests(ModelTesterMixin, unittest.Test
         expected_set = {"HunyuanVideoTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)

-    @require_torch_gpu
-    @require_torch_2
-    @is_torch_compile
-    @slow
-    def test_torch_compile_recompilation_and_graph_break(self):
-        torch._dynamo.reset()
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-        model = self.model_class(**init_dict).to(torch_device)
-        model = torch.compile(model, fullgraph=True)
-
-        with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
-            _ = model(**inputs_dict)
-            _ = model(**inputs_dict)
-

-class HunyuanVideoTokenReplaceImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
+class HunyuanVideoTokenReplaceImageToVideoTransformer3DTests(
+    ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase
+):
     model_class = HunyuanVideoTransformer3DModel
     main_input_name = "hidden_states"
     uses_custom_attn_processor = True
@@ -342,18 +295,3 @@ class HunyuanVideoTokenReplaceImageToVideoTransformer3DTests(ModelTesterMixin, u
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"HunyuanVideoTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
-
-    @require_torch_gpu
-    @require_torch_2
-    @is_torch_compile
-    @slow
-    def test_torch_compile_recompilation_and_graph_break(self):
-        torch._dynamo.reset()
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-        model = self.model_class(**init_dict).to(torch_device)
-        model = torch.compile(model, fullgraph=True)
-
-        with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
-            _ = model(**inputs_dict)
-            _ = model(**inputs_dict)
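Note: the per-class copies of test_torch_compile_recompilation_and_graph_break deleted above now come from TorchCompileTesterMixin instead. A skeleton of the pattern, with stub classes standing in for the real mixins in tests/models/test_modeling_common.py:

import unittest

class ModelTesterMixin:  # stub: the real mixin provides common model tests
    pass

class TorchCompileTesterMixin:  # stub: the real mixin resets compile caches and
    def test_torch_compile_recompilation_and_graph_break(self):
        pass                    # compiles self.model_class, failing on recompiles

class MyTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
    model_class = object  # each subclass only declares its model under test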
tests/models/transformers/test_models_transformer_wan.py

@@ -19,20 +19,16 @@ import torch

 from diffusers import WanTransformer3DModel
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
-    is_torch_compile,
-    require_torch_2,
-    require_torch_gpu,
-    slow,
     torch_device,
 )

-from ..test_modeling_common import ModelTesterMixin
+from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin


 enable_full_determinism()


-class WanTransformer3DTests(ModelTesterMixin, unittest.TestCase):
+class WanTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
     model_class = WanTransformer3DModel
     main_input_name = "hidden_states"
     uses_custom_attn_processor = True
@@ -86,18 +82,3 @@ class WanTransformer3DTests(ModelTesterMixin, unittest.TestCase):
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"WanTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
-
-    @require_torch_gpu
-    @require_torch_2
-    @is_torch_compile
-    @slow
-    def test_torch_compile_recompilation_and_graph_break(self):
-        torch._dynamo.reset()
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-        model = self.model_class(**init_dict).to(torch_device)
-        model = torch.compile(model, fullgraph=True)
-
-        with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
-            _ = model(**inputs_dict)
-            _ = model(**inputs_dict)
tests/pipelines/controlnet/test_controlnet.py

@@ -15,7 +15,6 @@

 import gc
 import tempfile
-import traceback
 import unittest

 import numpy as np
@@ -39,13 +38,9 @@ from diffusers.utils.testing_utils import (
     backend_reset_max_memory_allocated,
     backend_reset_peak_memory_stats,
     enable_full_determinism,
-    get_python_version,
-    is_torch_compile,
     load_image,
     load_numpy,
-    require_torch_2,
     require_torch_accelerator,
-    run_test_in_subprocess,
     slow,
     torch_device,
 )
@@ -68,52 +63,6 @@ from ..test_pipelines_common import (
 enable_full_determinism()


-# Will be run via run_test_in_subprocess
-def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
-    error = None
-    try:
-        _ = in_queue.get(timeout=timeout)
-
-        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
-
-        pipe = StableDiffusionControlNetPipeline.from_pretrained(
-            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
-        )
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-
-        pipe.unet.to(memory_format=torch.channels_last)
-        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
-
-        pipe.controlnet.to(memory_format=torch.channels_last)
-        pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True)
-
-        generator = torch.Generator(device="cpu").manual_seed(0)
-        prompt = "bird"
-        image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
-        ).resize((512, 512))
-
-        output = pipe(prompt, image, num_inference_steps=10, generator=generator, output_type="np")
-        image = output.images[0]
-
-        assert image.shape == (512, 512, 3)
-
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy"
-        )
-        expected_image = np.resize(expected_image, (512, 512, 3))
-
-        assert np.abs(expected_image - image).max() < 1.0
-
-    except Exception:
-        error = f"{traceback.format_exc()}"
-
-    results = {"error": error}
-    out_queue.put(results, timeout=timeout)
-    out_queue.join()
-
-
 class ControlNetPipelineFastTests(
     IPAdapterTesterMixin,
     PipelineLatentTesterMixin,
@@ -1053,15 +1002,6 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
         expected_slice = np.array([0.1655, 0.1721, 0.1623, 0.1685, 0.1711, 0.1646, 0.1651, 0.1631, 0.1494])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

-    @is_torch_compile
-    @require_torch_2
-    @unittest.skipIf(
-        get_python_version == (3, 12),
-        reason="Torch Dynamo isn't yet supported for Python 3.12.",
-    )
-    def test_stable_diffusion_compile(self):
-        run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=None)
-
     def test_v11_shuffle_global_pool_conditions(self):
         controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11e_sd15_shuffle")
tests/pipelines/controlnet_xs/test_controlnetxs.py

@@ -14,7 +14,6 @@
 # limitations under the License.

 import gc
-import traceback
 import unittest

 import numpy as np
@@ -36,13 +35,9 @@ from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import (
     backend_empty_cache,
     enable_full_determinism,
-    is_torch_compile,
     load_image,
-    load_numpy,
     require_accelerator,
-    require_torch_2,
     require_torch_accelerator,
-    run_test_in_subprocess,
     slow,
     torch_device,
 )
@@ -78,53 +73,6 @@ def to_np(tensor):
     return tensor


-# Will be run via run_test_in_subprocess
-def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
-    error = None
-    try:
-        _ = in_queue.get(timeout=timeout)
-
-        controlnet = ControlNetXSAdapter.from_pretrained(
-            "UmerHA/Testing-ConrolNetXS-SD2.1-canny", torch_dtype=torch.float16
-        )
-        pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
-            "stabilityai/stable-diffusion-2-1-base",
-            controlnet=controlnet,
-            safety_checker=None,
-            torch_dtype=torch.float16,
-        )
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-
-        pipe.unet.to(memory_format=torch.channels_last)
-        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
-
-        generator = torch.Generator(device="cpu").manual_seed(0)
-        prompt = "bird"
-        image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
-        ).resize((512, 512))
-
-        output = pipe(prompt, image, num_inference_steps=10, generator=generator, output_type="np")
-        image = output.images[0]
-
-        assert image.shape == (512, 512, 3)
-
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy"
-        )
-        expected_image = np.resize(expected_image, (512, 512, 3))
-
-        assert np.abs(expected_image - image).max() < 1.0
-
-    except Exception:
-        error = f"{traceback.format_exc()}"
-
-    results = {"error": error}
-    out_queue.put(results, timeout=timeout)
-    out_queue.join()
-
-
 class ControlNetXSPipelineFastTests(
     PipelineLatentTesterMixin,
     PipelineKarrasSchedulerTesterMixin,
@@ -402,8 +350,3 @@ class ControlNetXSPipelineSlowTests(unittest.TestCase):
         original_image = image[-3:, -3:, -1].flatten()
         expected_image = np.array([0.4844, 0.4937, 0.4956, 0.4663, 0.5039, 0.5044, 0.4565, 0.4883, 0.4941])
         assert np.allclose(original_image, expected_image, atol=1e-04)
-
-    @is_torch_compile
-    @require_torch_2
-    def test_stable_diffusion_compile(self):
-        run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=None)
tests/pipelines/stable_diffusion/test_stable_diffusion.py

@@ -17,7 +17,6 @@

 import gc
 import tempfile
 import time
-import traceback
 import unittest

 import numpy as np
@@ -49,16 +48,12 @@ from diffusers.utils.testing_utils import (
     backend_reset_max_memory_allocated,
     backend_reset_peak_memory_stats,
     enable_full_determinism,
-    is_torch_compile,
     load_image,
     load_numpy,
     nightly,
     numpy_cosine_similarity_distance,
     require_accelerate_version_greater,
-    require_torch_2,
     require_torch_accelerator,
     require_torch_multi_accelerator,
-    run_test_in_subprocess,
     skip_mps,
     slow,
     torch_device,
@@ -81,39 +76,6 @@ from ..test_pipelines_common import (
 enable_full_determinism()


-# Will be run via run_test_in_subprocess
-def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
-    error = None
-    try:
-        inputs = in_queue.get(timeout=timeout)
-        torch_device = inputs.pop("torch_device")
-        seed = inputs.pop("seed")
-        inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
-
-        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
-        sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
-        sd_pipe = sd_pipe.to(torch_device)
-
-        sd_pipe.unet.to(memory_format=torch.channels_last)
-        sd_pipe.unet = torch.compile(sd_pipe.unet, mode="reduce-overhead", fullgraph=True)
-
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        image = sd_pipe(**inputs).images
-        image_slice = image[0, -3:, -3:, -1].flatten()
-
-        assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239])
-
-        assert np.abs(image_slice - expected_slice).max() < 5e-3
-    except Exception:
-        error = f"{traceback.format_exc()}"
-
-    results = {"error": error}
-    out_queue.put(results, timeout=timeout)
-    out_queue.join()
-
-
 class StableDiffusionPipelineFastTests(
     IPAdapterTesterMixin,
     PipelineLatentTesterMixin,
@@ -1224,40 +1186,6 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
         max_diff = np.abs(expected_image - image).max()
         assert max_diff < 8e-1

-    @is_torch_compile
-    @require_torch_2
-    def test_stable_diffusion_compile(self):
-        seed = 0
-        inputs = self.get_inputs(torch_device, seed=seed)
-        # Can't pickle a Generator object
-        del inputs["generator"]
-        inputs["torch_device"] = torch_device
-        inputs["seed"] = seed
-        run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=inputs)
-
-    def test_stable_diffusion_lcm(self):
-        unet = UNet2DConditionModel.from_pretrained("SimianLuo/LCM_Dreamshaper_v7", subfolder="unet")
-        sd_pipe = StableDiffusionPipeline.from_pretrained("Lykon/dreamshaper-7", unet=unet).to(torch_device)
-        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_inputs(torch_device)
-        inputs["num_inference_steps"] = 6
-        inputs["output_type"] = "pil"
-
-        image = sd_pipe(**inputs).images[0]
-
-        expected_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/lcm_full/stable_diffusion_lcm.png"
-        )
-
-        image = sd_pipe.image_processor.pil_to_numpy(image)
-        expected_image = sd_pipe.image_processor.pil_to_numpy(expected_image)
-
-        max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
-
-        assert max_diff < 1e-2
-

 @slow
 @require_torch_accelerator
||||
@@ -15,7 +15,6 @@
|
||||
|
||||
import gc
|
||||
import random
|
||||
import traceback
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
@@ -41,13 +40,10 @@ from diffusers.utils.testing_utils import (
     backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
-    is_torch_compile,
     load_image,
     load_numpy,
     nightly,
-    require_torch_2,
     require_torch_accelerator,
-    run_test_in_subprocess,
     skip_mps,
     slow,
     torch_device,
@@ -70,38 +66,6 @@ from ..test_pipelines_common import (
 enable_full_determinism()


-# Will be run via run_test_in_subprocess
-def _test_img2img_compile(in_queue, out_queue, timeout):
-    error = None
-    try:
-        inputs = in_queue.get(timeout=timeout)
-        torch_device = inputs.pop("torch_device")
-        seed = inputs.pop("seed")
-        inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
-
-        pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
-        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.unet.set_default_attn_processor()
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.unet.to(memory_format=torch.channels_last)
-        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
-
-        image = pipe(**inputs).images
-        image_slice = image[0, -3:, -3:, -1].flatten()
-
-        assert image.shape == (1, 512, 768, 3)
-        expected_slice = np.array([0.0606, 0.0570, 0.0805, 0.0579, 0.0628, 0.0623, 0.0843, 0.1115, 0.0806])
-
-        assert np.abs(expected_slice - image_slice).max() < 1e-3
-    except Exception:
-        error = f"{traceback.format_exc()}"
-
-    results = {"error": error}
-    out_queue.put(results, timeout=timeout)
-    out_queue.join()
-
-
 class StableDiffusionImg2ImgPipelineFastTests(
     IPAdapterTesterMixin,
     PipelineLatentTesterMixin,
@@ -654,17 +618,6 @@ class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase):
         assert out.nsfw_content_detected[0], f"Safety checker should work for prompt: {inputs['prompt']}"
         assert np.abs(out.images[0]).sum() < 1e-5  # should be all zeros

-    @is_torch_compile
-    @require_torch_2
-    def test_img2img_compile(self):
-        seed = 0
-        inputs = self.get_inputs(torch_device, seed=seed)
-        # Can't pickle a Generator object
-        del inputs["generator"]
-        inputs["torch_device"] = torch_device
-        inputs["seed"] = seed
-        run_test_in_subprocess(test_case=self, target_func=_test_img2img_compile, inputs=inputs)
-

 @nightly
 @require_torch_accelerator
tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py

@@ -15,7 +15,6 @@

 import gc
 import random
-import traceback
 import unittest

 import numpy as np
@@ -44,13 +43,10 @@ from diffusers.utils.testing_utils import (
     backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
-    is_torch_compile,
     load_image,
     load_numpy,
     nightly,
-    require_torch_2,
     require_torch_accelerator,
-    run_test_in_subprocess,
     slow,
     torch_device,
 )
@@ -71,40 +67,6 @@ from ..test_pipelines_common import (
 enable_full_determinism()


-# Will be run via run_test_in_subprocess
-def _test_inpaint_compile(in_queue, out_queue, timeout):
-    error = None
-    try:
-        inputs = in_queue.get(timeout=timeout)
-        torch_device = inputs.pop("torch_device")
-        seed = inputs.pop("seed")
-        inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
-
-        pipe = StableDiffusionInpaintPipeline.from_pretrained(
-            "botp/stable-diffusion-v1-5-inpainting", safety_checker=None
-        )
-        pipe.unet.set_default_attn_processor()
-        pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config)
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-
-        pipe.unet.to(memory_format=torch.channels_last)
-        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
-
-        image = pipe(**inputs).images
-        image_slice = image[0, 253:256, 253:256, -1].flatten()
-
-        assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.0689, 0.0699, 0.0790, 0.0536, 0.0470, 0.0488, 0.041, 0.0508, 0.04179])
-        assert np.abs(expected_slice - image_slice).max() < 3e-3
-    except Exception:
-        error = f"{traceback.format_exc()}"
-
-    results = {"error": error}
-    out_queue.put(results, timeout=timeout)
-    out_queue.join()
-
-
 class StableDiffusionInpaintPipelineFastTests(
     IPAdapterTesterMixin,
     PipelineLatentTesterMixin,
@@ -727,17 +689,6 @@ class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase):
         # make sure that less than 2.2 GB is allocated
         assert mem_bytes < 2.2 * 10**9

-    @is_torch_compile
-    @require_torch_2
-    def test_inpaint_compile(self):
-        seed = 0
-        inputs = self.get_inputs(torch_device, seed=seed)
-        # Can't pickle a Generator object
-        del inputs["generator"]
-        inputs["torch_device"] = torch_device
-        inputs["seed"] = seed
-        run_test_in_subprocess(test_case=self, target_func=_test_inpaint_compile, inputs=inputs)
-
     def test_stable_diffusion_inpaint_pil_input_resolution_test(self):
         pipe = StableDiffusionInpaintPipeline.from_pretrained(
             "botp/stable-diffusion-v1-5-inpainting", safety_checker=None
@@ -964,11 +915,6 @@ class StableDiffusionInpaintPipelineAsymmetricAutoencoderKLSlowTests(unittest.Te
         # make sure that less than 2.45 GB is allocated
         assert mem_bytes < 2.45 * 10**9

-    @is_torch_compile
-    @require_torch_2
-    def test_inpaint_compile(self):
-        pass
-
     def test_stable_diffusion_inpaint_pil_input_resolution_test(self):
         vae = AsymmetricAutoencoderKL.from_pretrained(
             "cross-attention/asymmetric-autoencoder-kl-x-1-5",
tests/pipelines/test_pipeline_utils.py

@@ -217,6 +217,20 @@ class IsSafetensorsCompatibleTests(unittest.TestCase):
         ]
         self.assertFalse(is_safetensors_compatible(filenames))

+    def test_is_compatible_mixed_variants(self):
+        filenames = [
+            "unet/diffusion_pytorch_model.fp16.safetensors",
+            "vae/diffusion_pytorch_model.safetensors",
+        ]
+        self.assertTrue(is_safetensors_compatible(filenames, variant="fp16"))
+
+    def test_is_compatible_variant_and_non_safetensors(self):
+        filenames = [
+            "unet/diffusion_pytorch_model.fp16.safetensors",
+            "vae/diffusion_pytorch_model.bin",
+        ]
+        self.assertFalse(is_safetensors_compatible(filenames, variant="fp16"))
+

 class VariantCompatibleSiblingsTest(unittest.TestCase):
     def test_only_non_variants_downloaded(self):
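Note: the two added cases pin down the changed semantics; calling the helper directly shows the same thing (import path per the current source layout, which may move between releases):

from diffusers.pipelines.pipeline_loading_utils import is_safetensors_compatible

# Mixed variants are now compatible: unet ships fp16 safetensors, vae only the
# plain file, and the fallback accepts the combination...
print(is_safetensors_compatible(
    ["unet/diffusion_pytorch_model.fp16.safetensors", "vae/diffusion_pytorch_model.safetensors"],
    variant="fp16",
))  # True

# ...but a non-safetensors component still makes the whole set incompatible.
print(is_safetensors_compatible(
    ["unet/diffusion_pytorch_model.fp16.safetensors", "vae/diffusion_pytorch_model.bin"],
    variant="fp16",
))  # False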
tests/pipelines/test_pipelines.py

@@ -538,38 +538,26 @@ class DownloadTests(unittest.TestCase):
         variant = "no_ema"

         with tempfile.TemporaryDirectory() as tmpdirname:
-            if use_safetensors:
-                with self.assertRaises(OSError) as error_context:
-                    tmpdirname = StableDiffusionPipeline.download(
-                        "hf-internal-testing/stable-diffusion-all-variants",
-                        cache_dir=tmpdirname,
-                        variant=variant,
-                        use_safetensors=use_safetensors,
-                    )
-                assert "Could not find the necessary `safetensors` weights" in str(error_context.exception)
-            else:
-                tmpdirname = StableDiffusionPipeline.download(
-                    "hf-internal-testing/stable-diffusion-all-variants",
-                    cache_dir=tmpdirname,
-                    variant=variant,
-                    use_safetensors=use_safetensors,
-                )
-                all_root_files = [t[-1] for t in os.walk(tmpdirname)]
-                files = [item for sublist in all_root_files for item in sublist]
+            tmpdirname = StableDiffusionPipeline.download(
+                "hf-internal-testing/stable-diffusion-all-variants",
+                cache_dir=tmpdirname,
+                variant=variant,
+                use_safetensors=use_safetensors,
+            )
+            all_root_files = [t[-1] for t in os.walk(tmpdirname)]
+            files = [item for sublist in all_root_files for item in sublist]

-                unet_files = os.listdir(os.path.join(tmpdirname, "unet"))
+            unet_files = os.listdir(os.path.join(tmpdirname, "unet"))

-                # Some of the downloaded files should be a non-variant file, check:
-                # https://huggingface.co/hf-internal-testing/stable-diffusion-all-variants/tree/main/unet
-                assert len(files) == 15, f"We should only download 15 files, not {len(files)}"
-                # only unet has "no_ema" variant
-                assert f"diffusion_pytorch_model.{variant}{this_format}" in unet_files
-                assert len([f for f in files if f.endswith(f"{variant}{this_format}")]) == 1
-                # vae, safety_checker and text_encoder should have no variant
-                assert (
-                    sum(f.endswith(this_format) and not f.endswith(f"{variant}{this_format}") for f in files) == 3
-                )
-                assert not any(f.endswith(other_format) for f in files)
+            # Some of the downloaded files should be a non-variant file, check:
+            # https://huggingface.co/hf-internal-testing/stable-diffusion-all-variants/tree/main/unet
+            assert len(files) == 15, f"We should only download 15 files, not {len(files)}"
+            # only unet has "no_ema" variant
+            assert f"diffusion_pytorch_model.{variant}{this_format}" in unet_files
+            assert len([f for f in files if f.endswith(f"{variant}{this_format}")]) == 1
+            # vae, safety_checker and text_encoder should have no variant
+            assert sum(f.endswith(this_format) and not f.endswith(f"{variant}{this_format}") for f in files) == 3
+            assert not any(f.endswith(other_format) for f in files)

     def test_download_variants_with_sharded_checkpoints(self):
         # Here we test for downloading of "variant" files belonging to the `unet` and
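Note: the rewritten test encodes the user-facing effect of the loading change: with use_safetensors=True, a repo where only some components have the requested variant now downloads instead of raising OSError. A hedged usage sketch:

from diffusers import StableDiffusionPipeline

# Previously this raised "Could not find the necessary `safetensors` weights";
# with the fallback it fetches variant files where they exist (the "no_ema"
# unet) and plain safetensors for the other components.
path = StableDiffusionPipeline.download(
    "hf-internal-testing/stable-diffusion-all-variants",
    variant="no_ema",
    use_safetensors=True,
)
print(path)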
@@ -2006,7 +1994,9 @@ class PipelineSlowTests(unittest.TestCase):
         reason="Torch Dynamo isn't yet supported for Python 3.12.",
     )
     def test_from_save_pretrained_dynamo(self):
-        run_test_in_subprocess(test_case=self, target_func=_test_from_save_pretrained_dynamo, inputs=None)
+        torch.compiler.reset()
+        with torch._inductor.utils.fresh_inductor_cache():
+            run_test_in_subprocess(test_case=self, target_func=_test_from_save_pretrained_dynamo, inputs=None)

     def test_from_pretrained_hub(self):
         model_path = "google/ddpm-cifar10-32"
@@ -2218,7 +2208,7 @@ class TestLoraHotSwappingForPipeline(unittest.TestCase):
         # It is critical that the dynamo cache is reset for each test. Otherwise, if the test re-uses the same model,
         # there will be recompilation errors, as torch caches the model when run in the same process.
         super().tearDown()
-        torch._dynamo.reset()
+        torch.compiler.reset()
         gc.collect()
         backend_empty_cache(torch_device)
@@ -2343,21 +2333,21 @@ class TestLoraHotSwappingForPipeline(unittest.TestCase):
     def test_hotswapping_compiled_pipline_linear(self, rank0, rank1):
         # It's important to add this context to raise an error on recompilation
         target_modules = ["to_q", "to_k", "to_v", "to_out.0"]
-        with torch._dynamo.config.patch(error_on_recompile=True):
+        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
             self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)

     @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
     def test_hotswapping_compiled_pipline_conv2d(self, rank0, rank1):
         # It's important to add this context to raise an error on recompilation
         target_modules = ["conv", "conv1", "conv2"]
-        with torch._dynamo.config.patch(error_on_recompile=True):
+        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
             self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)

     @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
     def test_hotswapping_compiled_pipline_both_linear_and_conv2d(self, rank0, rank1):
         # It's important to add this context to raise an error on recompilation
         target_modules = ["to_q", "conv"]
-        with torch._dynamo.config.patch(error_on_recompile=True):
+        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
             self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)

     def test_enable_lora_hotswap_called_after_adapter_added_raises(self):
tests/pipelines/test_pipelines_common.py

@@ -1111,14 +1111,14 @@ class PipelineTesterMixin:
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
-        torch._dynamo.reset()
+        torch.compiler.reset()
         gc.collect()
         backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test in case of CUDA runtime errors
         super().tearDown()
-        torch._dynamo.reset()
+        torch.compiler.reset()
         gc.collect()
         backend_empty_cache(torch_device)
tests/pipelines/unidiffuser/test_unidiffuser.py

@@ -1,6 +1,5 @@
 import gc
 import random
-import traceback
 import unittest

 import numpy as np
@@ -27,9 +26,7 @@ from diffusers.utils.testing_utils import (
     floats_tensor,
     load_image,
     nightly,
-    require_torch_2,
     require_torch_accelerator,
-    run_test_in_subprocess,
     torch_device,
 )
 from diffusers.utils.torch_utils import randn_tensor
@@ -45,38 +42,6 @@ from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, Pipeline
 enable_full_determinism()


-# Will be run via run_test_in_subprocess
-def _test_unidiffuser_compile(in_queue, out_queue, timeout):
-    error = None
-    try:
-        inputs = in_queue.get(timeout=timeout)
-        torch_device = inputs.pop("torch_device")
-        seed = inputs.pop("seed")
-        inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
-
-        pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1")
-        # pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe = pipe.to(torch_device)
-
-        pipe.unet.to(memory_format=torch.channels_last)
-        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
-
-        pipe.set_progress_bar_config(disable=None)
-
-        image = pipe(**inputs).images
-        image_slice = image[0, -3:, -3:, -1].flatten()
-
-        assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.2402, 0.2375, 0.2285, 0.2378, 0.2407, 0.2263, 0.2354, 0.2307, 0.2520])
-        assert np.abs(image_slice - expected_slice).max() < 1e-1
-    except Exception:
-        error = f"{traceback.format_exc()}"
-
-    results = {"error": error}
-    out_queue.put(results, timeout=timeout)
-    out_queue.join()
-
-
 class UniDiffuserPipelineFastTests(
     PipelineTesterMixin, PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase
 ):
@@ -690,19 +655,6 @@ class UniDiffuserPipelineSlowTests(unittest.TestCase):
         expected_text_prefix = "An astronaut"
         assert text[0][: len(expected_text_prefix)] == expected_text_prefix

-    @unittest.skip(reason="Skip torch.compile test to speed up the slow test suite.")
-    @require_torch_2
-    def test_unidiffuser_compile(self, seed=0):
-        inputs = self.get_inputs(torch_device, seed=seed, generate_latents=True)
-        # Delete prompt and image for joint inference.
-        del inputs["prompt"]
-        del inputs["image"]
-        # Can't pickle a Generator object
-        del inputs["generator"]
-        inputs["torch_device"] = torch_device
-        inputs["seed"] = seed
-        run_test_in_subprocess(test_case=self, target_func=_test_unidiffuser_compile, inputs=inputs)
-

 @nightly
 @require_torch_accelerator