Compare commits

...

1 Commit

Author: Dhruv Nair
SHA1: 2a7bb53391
Message: Revert "[tests] Changes to the torch.compile() CI and tests (#11508)"
This reverts commit 4af76d0d7d.
Date: 2025-05-26 21:47:06 +05:30
17 changed files with 498 additions and 41 deletions

View File

@@ -23,7 +23,7 @@ jobs:
runs-on:
group: aws-g6-4xlarge-plus
container:
image: diffusers/diffusers-pytorch-cuda
image: diffusers/diffusers-pytorch-compile-cuda
options: --shm-size "16gb" --ipc host --gpus 0
steps:
- name: Checkout diffusers

View File

@@ -41,12 +41,6 @@ jobs:
run: |
CHANGED_FILES="${{ steps.file_changes.outputs.all }}"
for FILE in $CHANGED_FILES; do
# skip anything that isn't still on disk
if [[ ! -f "$FILE" ]]; then
echo "Skipping removed file $FILE"
continue
fi
if [[ "$FILE" == docker/*Dockerfile ]]; then
DOCKER_PATH="${FILE%/Dockerfile}"
DOCKER_TAG=$(basename "$DOCKER_PATH")
@@ -71,7 +65,7 @@ jobs:
image-name:
- diffusers-pytorch-cpu
- diffusers-pytorch-cuda
- diffusers-pytorch-cuda
- diffusers-pytorch-compile-cuda
- diffusers-pytorch-xformers-cuda
- diffusers-pytorch-minimum-cuda
- diffusers-flax-cpu
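
For reference, a hypothetical Python rendering of the changed-files guard this revert removes in the first hunk above (the workflow does the same in shell): entries in a changed-files list may have been deleted or renamed, so each path is checked against the working tree before a Docker tag is derived from it.

import os

def dockerfiles_to_rebuild(changed_files):
    tags = []
    for path in changed_files:
        if not os.path.isfile(path):  # skip anything that isn't still on disk
            continue
        dirname, filename = os.path.split(path)
        if dirname.startswith("docker/") and filename == "Dockerfile":
            tags.append(os.path.basename(dirname))  # e.g. "diffusers-pytorch-cuda"
    return tags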

View File

@@ -188,7 +188,7 @@ jobs:
group: aws-g4dn-2xlarge
container:
image: diffusers/diffusers-pytorch-cuda
image: diffusers/diffusers-pytorch-compile-cuda
options: --gpus 0 --shm-size "16gb" --ipc host
steps:

View File

@@ -262,7 +262,7 @@ jobs:
group: aws-g4dn-2xlarge
container:
image: diffusers/diffusers-pytorch-cuda
image: diffusers/diffusers-pytorch-compile-cuda
options: --gpus 0 --shm-size "16gb" --ipc host
steps:

View File

@@ -316,7 +316,7 @@ jobs:
group: aws-g4dn-2xlarge
container:
image: diffusers/diffusers-pytorch-cuda
image: diffusers/diffusers-pytorch-compile-cuda
options: --gpus 0 --shm-size "16gb" --ipc host
steps:

View File

@@ -0,0 +1,50 @@
FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
LABEL maintainer="Hugging Face"
LABEL repository="diffusers"
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get -y update \
&& apt-get install -y software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa
RUN apt install -y bash \
build-essential \
git \
git-lfs \
curl \
ca-certificates \
libsndfile1-dev \
libgl1 \
python3.10 \
python3.10-dev \
python3-pip \
python3.10-venv && \
rm -rf /var/lib/apt/lists
# make sure to use venv
RUN python3.10 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
python3.10 -m uv pip install --no-cache-dir \
torch \
torchvision \
torchaudio \
invisible_watermark && \
python3.10 -m pip install --no-cache-dir \
accelerate \
datasets \
hf-doc-builder \
huggingface-hub \
hf_transfer \
Jinja2 \
librosa \
numpy==1.26.4 \
scipy \
tensorboard \
transformers
CMD ["/bin/bash"]
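
A minimal sanity probe for a container built from this file, assuming the /opt/venv layout and the numpy==1.26.4 pin shown above (an editor sketch, not part of the image):

# Run inside the container to confirm the venv interpreter and pinned deps.
import sys

import numpy
import torch

# PATH puts /opt/venv/bin first, so the venv interpreter should be active.
assert sys.prefix != sys.base_prefix, f"not inside a venv: {sys.executable}"
assert numpy.__version__ == "1.26.4", numpy.__version__
print(sys.executable, torch.__version__)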

View File

@@ -1748,14 +1748,14 @@ class TorchCompileTesterMixin:
def setUp(self):
# clean up the VRAM before each test
super().setUp()
torch.compiler.reset()
torch._dynamo.reset()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test in case of CUDA runtime errors
super().tearDown()
torch.compiler.reset()
torch._dynamo.reset()
gc.collect()
backend_empty_cache(torch_device)
@@ -1764,17 +1764,13 @@ class TorchCompileTesterMixin:
@is_torch_compile
@slow
def test_torch_compile_recompilation_and_graph_break(self):
torch.compiler.reset()
torch._dynamo.reset()
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
model = self.model_class(**init_dict).to(torch_device)
model = torch.compile(model, fullgraph=True)
with (
torch._inductor.utils.fresh_inductor_cache(),
torch._dynamo.config.patch(error_on_recompile=True),
torch.no_grad(),
):
with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
_ = model(**inputs_dict)
_ = model(**inputs_dict)
@@ -1802,7 +1798,7 @@ class LoraHotSwappingForModelTesterMixin:
# It is critical that the dynamo cache is reset for each test. Otherwise, if the test re-uses the same model,
# there will be recompilation errors, as torch caches the model when run in the same process.
super().tearDown()
torch.compiler.reset()
torch._dynamo.reset()
gc.collect()
backend_empty_cache(torch_device)
@@ -1919,7 +1915,7 @@ class LoraHotSwappingForModelTesterMixin:
def test_hotswapping_compiled_model_linear(self, rank0, rank1):
# It's important to add this context to raise an error on recompilation
target_modules = ["to_q", "to_k", "to_v", "to_out.0"]
with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
with torch._dynamo.config.patch(error_on_recompile=True):
self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
@parameterized.expand([(11, 11), (7, 13), (13, 7)]) # important to test small to large and vice versa
@@ -1929,7 +1925,7 @@ class LoraHotSwappingForModelTesterMixin:
# It's important to add this context to raise an error on recompilation
target_modules = ["conv", "conv1", "conv2"]
with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
with torch._dynamo.config.patch(error_on_recompile=True):
self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
@parameterized.expand([(11, 11), (7, 13), (13, 7)]) # important to test small to large and vice versa
@@ -1939,7 +1935,7 @@ class LoraHotSwappingForModelTesterMixin:
# It's important to add this context to raise an error on recompilation
target_modules = ["to_q", "conv"]
with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
with torch._dynamo.config.patch(error_on_recompile=True):
self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
@parameterized.expand([(11, 11), (7, 13), (13, 7)]) # important to test small to large and vice versa
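
A standalone sketch of the recompilation guard these tests (and the per-class copies restored in the following files) rely on, assuming PyTorch 2.x and a toy module in place of a diffusers model:

import torch

class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(16, 16)

    def forward(self, hidden_states):
        return self.proj(hidden_states)

torch._dynamo.reset()  # drop graphs cached by anything that ran earlier
model = torch.compile(ToyModel(), fullgraph=True)  # fullgraph=True also surfaces graph breaks

# error_on_recompile turns a silent recompile into an exception, so running
# the same inputs twice proves the first compiled graph was reused.
with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
    x = torch.randn(2, 16)
    _ = model(x)
    _ = model(x)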

View File

@@ -19,16 +19,20 @@ import torch
from diffusers import HunyuanVideoTransformer3DModel
from diffusers.utils.testing_utils import (
enable_full_determinism,
is_torch_compile,
require_torch_2,
require_torch_gpu,
slow,
torch_device,
)
from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin
from ..test_modeling_common import ModelTesterMixin
enable_full_determinism()
class HunyuanVideoTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
class HunyuanVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
model_class = HunyuanVideoTransformer3DModel
main_input_name = "hidden_states"
uses_custom_attn_processor = True
@@ -92,8 +96,23 @@ class HunyuanVideoTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin,
expected_set = {"HunyuanVideoTransformer3DModel"}
super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
@require_torch_gpu
@require_torch_2
@is_torch_compile
@slow
def test_torch_compile_recompilation_and_graph_break(self):
torch._dynamo.reset()
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
class HunyuanSkyreelsImageToVideoTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
model = self.model_class(**init_dict).to(torch_device)
model = torch.compile(model, fullgraph=True)
with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
_ = model(**inputs_dict)
_ = model(**inputs_dict)
class HunyuanSkyreelsImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
model_class = HunyuanVideoTransformer3DModel
main_input_name = "hidden_states"
uses_custom_attn_processor = True
@@ -160,8 +179,23 @@ class HunyuanSkyreelsImageToVideoTransformer3DTests(ModelTesterMixin, TorchCompi
expected_set = {"HunyuanVideoTransformer3DModel"}
super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
@require_torch_gpu
@require_torch_2
@is_torch_compile
@slow
def test_torch_compile_recompilation_and_graph_break(self):
torch._dynamo.reset()
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
class HunyuanVideoImageToVideoTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
model = self.model_class(**init_dict).to(torch_device)
model = torch.compile(model, fullgraph=True)
with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
_ = model(**inputs_dict)
_ = model(**inputs_dict)
class HunyuanVideoImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
model_class = HunyuanVideoTransformer3DModel
main_input_name = "hidden_states"
uses_custom_attn_processor = True
@@ -226,10 +260,23 @@ class HunyuanVideoImageToVideoTransformer3DTests(ModelTesterMixin, TorchCompileT
expected_set = {"HunyuanVideoTransformer3DModel"}
super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
@require_torch_gpu
@require_torch_2
@is_torch_compile
@slow
def test_torch_compile_recompilation_and_graph_break(self):
torch._dynamo.reset()
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
class HunyuanVideoTokenReplaceImageToVideoTransformer3DTests(
ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase
):
model = self.model_class(**init_dict).to(torch_device)
model = torch.compile(model, fullgraph=True)
with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
_ = model(**inputs_dict)
_ = model(**inputs_dict)
class HunyuanVideoTokenReplaceImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
model_class = HunyuanVideoTransformer3DModel
main_input_name = "hidden_states"
uses_custom_attn_processor = True
@@ -295,3 +342,18 @@ class HunyuanVideoTokenReplaceImageToVideoTransformer3DTests(
def test_gradient_checkpointing_is_applied(self):
expected_set = {"HunyuanVideoTransformer3DModel"}
super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
@require_torch_gpu
@require_torch_2
@is_torch_compile
@slow
def test_torch_compile_recompilation_and_graph_break(self):
torch._dynamo.reset()
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
model = self.model_class(**init_dict).to(torch_device)
model = torch.compile(model, fullgraph=True)
with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
_ = model(**inputs_dict)
_ = model(**inputs_dict)

View File

@@ -19,16 +19,20 @@ import torch
from diffusers import WanTransformer3DModel
from diffusers.utils.testing_utils import (
enable_full_determinism,
is_torch_compile,
require_torch_2,
require_torch_gpu,
slow,
torch_device,
)
from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin
from ..test_modeling_common import ModelTesterMixin
enable_full_determinism()
class WanTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
class WanTransformer3DTests(ModelTesterMixin, unittest.TestCase):
model_class = WanTransformer3DModel
main_input_name = "hidden_states"
uses_custom_attn_processor = True
@@ -82,3 +86,18 @@ class WanTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.
def test_gradient_checkpointing_is_applied(self):
expected_set = {"WanTransformer3DModel"}
super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
@require_torch_gpu
@require_torch_2
@is_torch_compile
@slow
def test_torch_compile_recompilation_and_graph_break(self):
torch._dynamo.reset()
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
model = self.model_class(**init_dict).to(torch_device)
model = torch.compile(model, fullgraph=True)
with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
_ = model(**inputs_dict)
_ = model(**inputs_dict)

View File

@@ -15,6 +15,7 @@
import gc
import tempfile
import traceback
import unittest
import numpy as np
@@ -38,9 +39,13 @@ from diffusers.utils.testing_utils import (
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism,
get_python_version,
is_torch_compile,
load_image,
load_numpy,
require_torch_2,
require_torch_accelerator,
run_test_in_subprocess,
slow,
torch_device,
)
@@ -63,6 +68,52 @@ from ..test_pipelines_common import (
enable_full_determinism()
# Will be run via run_test_in_subprocess
def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
error = None
try:
_ = in_queue.get(timeout=timeout)
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
)
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
pipe.unet.to(memory_format=torch.channels_last)
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
pipe.controlnet.to(memory_format=torch.channels_last)
pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True)
generator = torch.Generator(device="cpu").manual_seed(0)
prompt = "bird"
image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
).resize((512, 512))
output = pipe(prompt, image, num_inference_steps=10, generator=generator, output_type="np")
image = output.images[0]
assert image.shape == (512, 512, 3)
expected_image = load_numpy(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy"
)
expected_image = np.resize(expected_image, (512, 512, 3))
assert np.abs(expected_image - image).max() < 1.0
except Exception:
error = f"{traceback.format_exc()}"
results = {"error": error}
out_queue.put(results, timeout=timeout)
out_queue.join()
class ControlNetPipelineFastTests(
IPAdapterTesterMixin,
PipelineLatentTesterMixin,
@@ -1002,6 +1053,15 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
expected_slice = np.array([0.1655, 0.1721, 0.1623, 0.1685, 0.1711, 0.1646, 0.1651, 0.1631, 0.1494])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@is_torch_compile
@require_torch_2
@unittest.skipIf(
get_python_version() == (3, 12),
reason="Torch Dynamo isn't yet supported for Python 3.12.",
)
def test_stable_diffusion_compile(self):
run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=None)
def test_v11_shuffle_global_pool_conditions(self):
controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11e_sd15_shuffle")
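
The _test_*_compile helpers in this and the following files are all written against the same queue contract; the real harness is run_test_in_subprocess in diffusers.utils.testing_utils, and the illustrative reimplementation below only shows the shape of that contract. A spawned process starts with no inherited dynamo/inductor state, which is why the compile tests run there instead of in the main test process.

import multiprocessing

def run_in_subprocess(target_func, inputs=None, timeout=600):
    ctx = multiprocessing.get_context("spawn")
    in_queue = ctx.JoinableQueue()
    out_queue = ctx.JoinableQueue()
    in_queue.put(inputs, timeout=timeout)  # the helper reads this first

    process = ctx.Process(target=target_func, args=(in_queue, out_queue, timeout))
    process.start()
    results = out_queue.get(timeout=timeout)  # the helper puts {"error": ...}
    out_queue.task_done()                     # unblocks the helper's out_queue.join()
    process.join(timeout=timeout)

    if results["error"] is not None:
        raise AssertionError(results["error"])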

View File

@@ -14,6 +14,7 @@
# limitations under the License.
import gc
import traceback
import unittest
import numpy as np
@@ -35,9 +36,13 @@ from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
is_torch_compile,
load_image,
load_numpy,
require_accelerator,
require_torch_2,
require_torch_accelerator,
run_test_in_subprocess,
slow,
torch_device,
)
@@ -73,6 +78,53 @@ def to_np(tensor):
return tensor
# Will be run via run_test_in_subprocess
def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
error = None
try:
_ = in_queue.get(timeout=timeout)
controlnet = ControlNetXSAdapter.from_pretrained(
"UmerHA/Testing-ConrolNetXS-SD2.1-canny", torch_dtype=torch.float16
)
pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
"stabilityai/stable-diffusion-2-1-base",
controlnet=controlnet,
safety_checker=None,
torch_dtype=torch.float16,
)
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
pipe.unet.to(memory_format=torch.channels_last)
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
generator = torch.Generator(device="cpu").manual_seed(0)
prompt = "bird"
image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
).resize((512, 512))
output = pipe(prompt, image, num_inference_steps=10, generator=generator, output_type="np")
image = output.images[0]
assert image.shape == (512, 512, 3)
expected_image = load_numpy(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy"
)
expected_image = np.resize(expected_image, (512, 512, 3))
assert np.abs(expected_image - image).max() < 1.0
except Exception:
error = f"{traceback.format_exc()}"
results = {"error": error}
out_queue.put(results, timeout=timeout)
out_queue.join()
class ControlNetXSPipelineFastTests(
PipelineLatentTesterMixin,
PipelineKarrasSchedulerTesterMixin,
@@ -350,3 +402,8 @@ class ControlNetXSPipelineSlowTests(unittest.TestCase):
original_image = image[-3:, -3:, -1].flatten()
expected_image = np.array([0.4844, 0.4937, 0.4956, 0.4663, 0.5039, 0.5044, 0.4565, 0.4883, 0.4941])
assert np.allclose(original_image, expected_image, atol=1e-04)
@is_torch_compile
@require_torch_2
def test_stable_diffusion_compile(self):
run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=None)

View File

@@ -17,6 +17,7 @@
import gc
import tempfile
import time
import traceback
import unittest
import numpy as np
@@ -48,12 +49,16 @@ from diffusers.utils.testing_utils import (
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism,
is_torch_compile,
load_image,
load_numpy,
nightly,
numpy_cosine_similarity_distance,
require_accelerate_version_greater,
require_torch_2,
require_torch_accelerator,
require_torch_multi_accelerator,
run_test_in_subprocess,
skip_mps,
slow,
torch_device,
@@ -76,6 +81,39 @@ from ..test_pipelines_common import (
enable_full_determinism()
# Will be run via run_test_in_subprocess
def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
error = None
try:
inputs = in_queue.get(timeout=timeout)
torch_device = inputs.pop("torch_device")
seed = inputs.pop("seed")
inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe.unet.to(memory_format=torch.channels_last)
sd_pipe.unet = torch.compile(sd_pipe.unet, mode="reduce-overhead", fullgraph=True)
sd_pipe.set_progress_bar_config(disable=None)
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239])
assert np.abs(image_slice - expected_slice).max() < 5e-3
except Exception:
error = f"{traceback.format_exc()}"
results = {"error": error}
out_queue.put(results, timeout=timeout)
out_queue.join()
class StableDiffusionPipelineFastTests(
IPAdapterTesterMixin,
PipelineLatentTesterMixin,
@@ -1186,6 +1224,40 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
max_diff = np.abs(expected_image - image).max()
assert max_diff < 8e-1
@is_torch_compile
@require_torch_2
def test_stable_diffusion_compile(self):
seed = 0
inputs = self.get_inputs(torch_device, seed=seed)
# Can't pickle a Generator object
del inputs["generator"]
inputs["torch_device"] = torch_device
inputs["seed"] = seed
run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=inputs)
def test_stable_diffusion_lcm(self):
unet = UNet2DConditionModel.from_pretrained("SimianLuo/LCM_Dreamshaper_v7", subfolder="unet")
sd_pipe = StableDiffusionPipeline.from_pretrained("Lykon/dreamshaper-7", unet=unet).to(torch_device)
sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs(torch_device)
inputs["num_inference_steps"] = 6
inputs["output_type"] = "pil"
image = sd_pipe(**inputs).images[0]
expected_image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/lcm_full/stable_diffusion_lcm.png"
)
image = sd_pipe.image_processor.pil_to_numpy(image)
expected_image = sd_pipe.image_processor.pil_to_numpy(expected_image)
max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
assert max_diff < 1e-2
@slow
@require_torch_accelerator
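
The seed shuffling in test_stable_diffusion_compile (and in the img2img, inpaint, and UniDiffuser variants below) works around torch.Generator not being picklable. A small sketch of both sides of that handoff, with hypothetical helper names:

import torch

def strip_generator(inputs, torch_device, seed=0):
    # parent side: runs before the inputs dict is put on the queue
    inputs = dict(inputs)
    inputs.pop("generator", None)  # drop the unpicklable object
    inputs["torch_device"] = torch_device
    inputs["seed"] = seed          # send the reproducible part instead
    return inputs

def rebuild_generator(inputs):
    # child side: first thing the _test_*_compile helper does after get()
    torch_device = inputs.pop("torch_device")
    seed = inputs.pop("seed")
    inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
    return inputs, torch_device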

View File

@@ -15,6 +15,7 @@
import gc
import random
import traceback
import unittest
import numpy as np
@@ -40,10 +41,13 @@ from diffusers.utils.testing_utils import (
backend_reset_peak_memory_stats,
enable_full_determinism,
floats_tensor,
is_torch_compile,
load_image,
load_numpy,
nightly,
require_torch_2,
require_torch_accelerator,
run_test_in_subprocess,
skip_mps,
slow,
torch_device,
@@ -66,6 +70,38 @@ from ..test_pipelines_common import (
enable_full_determinism()
# Will be run via run_test_in_subprocess
def _test_img2img_compile(in_queue, out_queue, timeout):
error = None
try:
inputs = in_queue.get(timeout=timeout)
torch_device = inputs.pop("torch_device")
seed = inputs.pop("seed")
inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.unet.set_default_attn_processor()
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
pipe.unet.to(memory_format=torch.channels_last)
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 768, 3)
expected_slice = np.array([0.0606, 0.0570, 0.0805, 0.0579, 0.0628, 0.0623, 0.0843, 0.1115, 0.0806])
assert np.abs(expected_slice - image_slice).max() < 1e-3
except Exception:
error = f"{traceback.format_exc()}"
results = {"error": error}
out_queue.put(results, timeout=timeout)
out_queue.join()
class StableDiffusionImg2ImgPipelineFastTests(
IPAdapterTesterMixin,
PipelineLatentTesterMixin,
@@ -618,6 +654,17 @@ class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase):
assert out.nsfw_content_detected[0], f"Safety checker should work for prompt: {inputs['prompt']}"
assert np.abs(out.images[0]).sum() < 1e-5 # should be all zeros
@is_torch_compile
@require_torch_2
def test_img2img_compile(self):
seed = 0
inputs = self.get_inputs(torch_device, seed=seed)
# Can't pickle a Generator object
del inputs["generator"]
inputs["torch_device"] = torch_device
inputs["seed"] = seed
run_test_in_subprocess(test_case=self, target_func=_test_img2img_compile, inputs=inputs)
@nightly
@require_torch_accelerator

View File

@@ -15,6 +15,7 @@
import gc
import random
import traceback
import unittest
import numpy as np
@@ -43,10 +44,13 @@ from diffusers.utils.testing_utils import (
backend_reset_peak_memory_stats,
enable_full_determinism,
floats_tensor,
is_torch_compile,
load_image,
load_numpy,
nightly,
require_torch_2,
require_torch_accelerator,
run_test_in_subprocess,
slow,
torch_device,
)
@@ -67,6 +71,40 @@ from ..test_pipelines_common import (
enable_full_determinism()
# Will be run via run_test_in_subprocess
def _test_inpaint_compile(in_queue, out_queue, timeout):
error = None
try:
inputs = in_queue.get(timeout=timeout)
torch_device = inputs.pop("torch_device")
seed = inputs.pop("seed")
inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
pipe = StableDiffusionInpaintPipeline.from_pretrained(
"botp/stable-diffusion-v1-5-inpainting", safety_checker=None
)
pipe.unet.set_default_attn_processor()
pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config)
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
pipe.unet.to(memory_format=torch.channels_last)
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
image = pipe(**inputs).images
image_slice = image[0, 253:256, 253:256, -1].flatten()
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.0689, 0.0699, 0.0790, 0.0536, 0.0470, 0.0488, 0.041, 0.0508, 0.04179])
assert np.abs(expected_slice - image_slice).max() < 3e-3
except Exception:
error = f"{traceback.format_exc()}"
results = {"error": error}
out_queue.put(results, timeout=timeout)
out_queue.join()
class StableDiffusionInpaintPipelineFastTests(
IPAdapterTesterMixin,
PipelineLatentTesterMixin,
@@ -689,6 +727,17 @@ class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase):
# make sure that less than 2.2 GB is allocated
assert mem_bytes < 2.2 * 10**9
@is_torch_compile
@require_torch_2
def test_inpaint_compile(self):
seed = 0
inputs = self.get_inputs(torch_device, seed=seed)
# Can't pickle a Generator object
del inputs["generator"]
inputs["torch_device"] = torch_device
inputs["seed"] = seed
run_test_in_subprocess(test_case=self, target_func=_test_inpaint_compile, inputs=inputs)
def test_stable_diffusion_inpaint_pil_input_resolution_test(self):
pipe = StableDiffusionInpaintPipeline.from_pretrained(
"botp/stable-diffusion-v1-5-inpainting", safety_checker=None
@@ -915,6 +964,11 @@ class StableDiffusionInpaintPipelineAsymmetricAutoencoderKLSlowTests(unittest.Te
# make sure that less than 2.45 GB is allocated
assert mem_bytes < 2.45 * 10**9
@is_torch_compile
@require_torch_2
def test_inpaint_compile(self):
pass
def test_stable_diffusion_inpaint_pil_input_resolution_test(self):
vae = AsymmetricAutoencoderKL.from_pretrained(
"cross-attention/asymmetric-autoencoder-kl-x-1-5",

View File

@@ -1994,9 +1994,7 @@ class PipelineSlowTests(unittest.TestCase):
reason="Torch Dynamo isn't yet supported for Python 3.12.",
)
def test_from_save_pretrained_dynamo(self):
torch.compiler.reset()
with torch._inductor.utils.fresh_inductor_cache():
run_test_in_subprocess(test_case=self, target_func=_test_from_save_pretrained_dynamo, inputs=None)
run_test_in_subprocess(test_case=self, target_func=_test_from_save_pretrained_dynamo, inputs=None)
def test_from_pretrained_hub(self):
model_path = "google/ddpm-cifar10-32"
@@ -2208,7 +2206,7 @@ class TestLoraHotSwappingForPipeline(unittest.TestCase):
# It is critical that the dynamo cache is reset for each test. Otherwise, if the test re-uses the same model,
# there will be recompilation errors, as torch caches the model when run in the same process.
super().tearDown()
torch.compiler.reset()
torch._dynamo.reset()
gc.collect()
backend_empty_cache(torch_device)
@@ -2333,21 +2331,21 @@ class TestLoraHotSwappingForPipeline(unittest.TestCase):
def test_hotswapping_compiled_pipline_linear(self, rank0, rank1):
# It's important to add this context to raise an error on recompilation
target_modules = ["to_q", "to_k", "to_v", "to_out.0"]
with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
with torch._dynamo.config.patch(error_on_recompile=True):
self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
@parameterized.expand([(11, 11), (7, 13), (13, 7)]) # important to test small to large and vice versa
def test_hotswapping_compiled_pipline_conv2d(self, rank0, rank1):
# It's important to add this context to raise an error on recompilation
target_modules = ["conv", "conv1", "conv2"]
with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
with torch._dynamo.config.patch(error_on_recompile=True):
self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
@parameterized.expand([(11, 11), (7, 13), (13, 7)]) # important to test small to large and vice versa
def test_hotswapping_compiled_pipline_both_linear_and_conv2d(self, rank0, rank1):
# It's important to add this context to raise an error on recompilation
target_modules = ["to_q", "conv"]
with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
with torch._dynamo.config.patch(error_on_recompile=True):
self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
def test_enable_lora_hotswap_called_after_adapter_added_raises(self):

View File

@@ -1111,14 +1111,14 @@ class PipelineTesterMixin:
def setUp(self):
# clean up the VRAM before each test
super().setUp()
torch.compiler.reset()
torch._dynamo.reset()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test in case of CUDA runtime errors
super().tearDown()
torch.compiler.reset()
torch._dynamo.reset()
gc.collect()
backend_empty_cache(torch_device)
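
A compact version of the setUp/tearDown hygiene above, assuming a CUDA device; diffusers' backend_empty_cache dispatches to whichever accelerator backend is active.

import gc
import torch

def reset_test_state():
    torch._dynamo.reset()         # drop compiled graphs so tests stay isolated
    gc.collect()                  # release Python-side references first
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # then hand freed blocks back to the driver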

View File

@@ -1,5 +1,6 @@
import gc
import random
import traceback
import unittest
import numpy as np
@@ -26,7 +27,9 @@ from diffusers.utils.testing_utils import (
floats_tensor,
load_image,
nightly,
require_torch_2,
require_torch_accelerator,
run_test_in_subprocess,
torch_device,
)
from diffusers.utils.torch_utils import randn_tensor
@@ -42,6 +45,38 @@ from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, Pipeline
enable_full_determinism()
# Will be run via run_test_in_subprocess
def _test_unidiffuser_compile(in_queue, out_queue, timeout):
error = None
try:
inputs = in_queue.get(timeout=timeout)
torch_device = inputs.pop("torch_device")
seed = inputs.pop("seed")
inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1")
# pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to(torch_device)
pipe.unet.to(memory_format=torch.channels_last)
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
pipe.set_progress_bar_config(disable=None)
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.2402, 0.2375, 0.2285, 0.2378, 0.2407, 0.2263, 0.2354, 0.2307, 0.2520])
assert np.abs(image_slice - expected_slice).max() < 1e-1
except Exception:
error = f"{traceback.format_exc()}"
results = {"error": error}
out_queue.put(results, timeout=timeout)
out_queue.join()
class UniDiffuserPipelineFastTests(
PipelineTesterMixin, PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase
):
@@ -655,6 +690,19 @@ class UniDiffuserPipelineSlowTests(unittest.TestCase):
expected_text_prefix = "An astronaut"
assert text[0][: len(expected_text_prefix)] == expected_text_prefix
@unittest.skip(reason="Skip torch.compile test to speed up the slow test suite.")
@require_torch_2
def test_unidiffuser_compile(self, seed=0):
inputs = self.get_inputs(torch_device, seed=seed, generate_latents=True)
# Delete prompt and image for joint inference.
del inputs["prompt"]
del inputs["image"]
# Can't pickle a Generator object
del inputs["generator"]
inputs["torch_device"] = torch_device
inputs["seed"] = seed
run_test_in_subprocess(test_case=self, target_func=_test_unidiffuser_compile, inputs=inputs)
@nightly
@require_torch_accelerator