Compare commits


1 Commit

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| DN6 | 95d8e68bc2 | update | 2025-05-22 21:46:19 +05:30 |
39 changed files with 753 additions and 430 deletions

View File

@@ -23,7 +23,7 @@ jobs:
runs-on:
group: aws-g6-4xlarge-plus
container:
image: diffusers/diffusers-pytorch-cuda
image: diffusers/diffusers-pytorch-compile-cuda
options: --shm-size "16gb" --ipc host --gpus 0
steps:
- name: Checkout diffusers

View File

@@ -38,16 +38,9 @@ jobs:
token: ${{ secrets.GITHUB_TOKEN }}
- name: Build Changed Docker Images
env:
CHANGED_FILES: "${{ steps.file_changes.outputs.all }}"
run: |
CHANGED_FILES="${{ steps.file_changes.outputs.all }}"
for FILE in $CHANGED_FILES; do
# skip anything that isn't still on disk
if [[ ! -f "$FILE" ]]; then
echo "Skipping removed file $FILE"
continue
fi
if [[ "$FILE" == docker/*Dockerfile ]]; then
DOCKER_PATH="${FILE%/Dockerfile}"
DOCKER_TAG=$(basename "$DOCKER_PATH")
@@ -72,7 +65,7 @@ jobs:
image-name:
- diffusers-pytorch-cpu
- diffusers-pytorch-cuda
- diffusers-pytorch-cuda
- diffusers-pytorch-compile-cuda
- diffusers-pytorch-xformers-cuda
- diffusers-pytorch-minimum-cuda
- diffusers-flax-cpu

View File

@@ -188,7 +188,7 @@ jobs:
group: aws-g4dn-2xlarge
container:
image: diffusers/diffusers-pytorch-cuda
image: diffusers/diffusers-pytorch-compile-cuda
options: --gpus 0 --shm-size "16gb" --ipc host
steps:

View File

@@ -262,7 +262,7 @@ jobs:
group: aws-g4dn-2xlarge
container:
image: diffusers/diffusers-pytorch-cuda
image: diffusers/diffusers-pytorch-compile-cuda
options: --gpus 0 --shm-size "16gb" --ipc host
steps:

View File

@@ -316,7 +316,7 @@ jobs:
group: aws-g4dn-2xlarge
container:
image: diffusers/diffusers-pytorch-cuda
image: diffusers/diffusers-pytorch-compile-cuda
options: --gpus 0 --shm-size "16gb" --ipc host
steps:

View File

@@ -0,0 +1,50 @@
FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
LABEL maintainer="Hugging Face"
LABEL repository="diffusers"
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get -y update \
&& apt-get install -y software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa
RUN apt install -y bash \
build-essential \
git \
git-lfs \
curl \
ca-certificates \
libsndfile1-dev \
libgl1 \
python3.10 \
python3.10-dev \
python3-pip \
python3.10-venv && \
rm -rf /var/lib/apt/lists
# make sure to use venv
RUN python3.10 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
python3.10 -m uv pip install --no-cache-dir \
torch \
torchvision \
torchaudio \
invisible_watermark && \
python3.10 -m pip install --no-cache-dir \
accelerate \
datasets \
hf-doc-builder \
huggingface-hub \
hf_transfer \
Jinja2 \
librosa \
numpy==1.26.4 \
scipy \
tensorboard \
transformers \
hf_transfer
CMD ["/bin/bash"]

View File

@@ -208,7 +208,7 @@
- local: optimization/mps
title: Metal Performance Shaders (MPS)
- local: optimization/habana
title: Intel Gaudi
title: Habana Gaudi
- local: optimization/neuron
title: AWS Neuron
title: Optimized hardware

View File

@@ -10,22 +10,67 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
specific language governing permissions and limitations under the License.
-->
# Intel Gaudi
# Habana Gaudi
The Intel Gaudi AI accelerator family includes [Intel Gaudi 1](https://habana.ai/products/gaudi/), [Intel Gaudi 2](https://habana.ai/products/gaudi2/), and [Intel Gaudi 3](https://habana.ai/products/gaudi3/). Each server is equipped with 8 devices, known as Habana Processing Units (HPUs), providing 128GB of memory on Gaudi 3, 96GB on Gaudi 2, and 32GB on the first-gen Gaudi. For more details on the underlying hardware architecture, check out the [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html) overview.
🤗 Diffusers is compatible with Habana Gaudi through 🤗 [Optimum](https://huggingface.co/docs/optimum/habana/usage_guides/stable_diffusion). Follow the [installation](https://docs.habana.ai/en/latest/Installation_Guide/index.html) guide to install the SynapseAI and Gaudi drivers, and then install Optimum Habana:
Diffusers pipelines can take advantage of HPU acceleration, even if a pipeline hasn't been added to [Optimum for Intel Gaudi](https://huggingface.co/docs/optimum/main/en/habana/index) yet, with the [GPU Migration Toolkit](https://docs.habana.ai/en/latest/PyTorch/PyTorch_Model_Porting/GPU_Migration_Toolkit/GPU_Migration_Toolkit.html).
Call `.to("hpu")` on your pipeline to move it to a HPU device as shown below for Flux:
```py
import torch
from diffusers import DiffusionPipeline
pipeline = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
pipeline.to("hpu")
image = pipeline("An image of a squirrel in Picasso style").images[0]
```bash
python -m pip install --upgrade-strategy eager optimum[habana]
```
> [!TIP]
> For Gaudi-optimized diffusion pipeline implementations, we recommend using [Optimum for Intel Gaudi](https://huggingface.co/docs/optimum/main/en/habana/index).
To generate images with Stable Diffusion 1 and 2 on Gaudi, you need to instantiate two components:
- [`~optimum.habana.diffusers.GaudiStableDiffusionPipeline`], a pipeline for text-to-image generation.
- [`~optimum.habana.diffusers.GaudiDDIMScheduler`], a Gaudi-optimized scheduler.
When you initialize the pipeline, you have to specify `use_habana=True` to deploy it on HPUs, and to get the fastest possible generation, you should enable **HPU graphs** with `use_hpu_graphs=True`.
Finally, specify a [`~optimum.habana.GaudiConfig`] which can be downloaded from the [Habana](https://huggingface.co/Habana) organization on the Hub.
```python
from optimum.habana import GaudiConfig
from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionPipeline
model_name = "stabilityai/stable-diffusion-2-base"
scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler")
pipeline = GaudiStableDiffusionPipeline.from_pretrained(
model_name,
scheduler=scheduler,
use_habana=True,
use_hpu_graphs=True,
gaudi_config="Habana/stable-diffusion-2",
)
```
Now you can call the pipeline to generate images in batches from one or several prompts:
```python
outputs = pipeline(
prompt=[
"High quality photo of an astronaut riding a horse in space",
"Face of a yellow cat, high resolution, sitting on a park bench",
],
num_images_per_prompt=10,
batch_size=4,
)
```
For more information, check out 🤗 Optimum Habana's [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/stable_diffusion) and the [example](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion) provided in the official GitHub repository.
## Benchmark
We benchmarked Habana's first-generation Gaudi and Gaudi2 with the [Habana/stable-diffusion](https://huggingface.co/Habana/stable-diffusion) and [Habana/stable-diffusion-2](https://huggingface.co/Habana/stable-diffusion-2) Gaudi configurations (mixed precision bf16/fp32) to demonstrate their performance.
For [Stable Diffusion v1.5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) on 512x512 images:
| | Latency (batch size = 1) | Throughput |
| ---------------------- |:------------------------:|:---------------------------:|
| first-generation Gaudi | 3.80s | 0.308 images/s (batch size = 8) |
| Gaudi2 | 1.33s | 1.081 images/s (batch size = 8) |
For [Stable Diffusion v2.1](https://huggingface.co/stabilityai/stable-diffusion-2-1) on 768x768 images:
| | Latency (batch size = 1) | Throughput |
| ---------------------- |:------------------------:|:-------------------------------:|
| first-generation Gaudi | 10.2s | 0.108 images/s (batch size = 4) |
| Gaudi2 | 3.17s | 0.379 images/s (batch size = 8) |

View File

@@ -13,30 +13,80 @@ specific language governing permissions and limitations under the License.
# Quantization
Quantization focuses on representing data with fewer bits while also trying to preserve the precision of the original data. This often means converting a data type to represent the same information with fewer bits. For example, if your model weights are stored as 32-bit floating points and they're quantized to 16-bit floating points, this halves the model size, which makes it easier to store and reduces memory usage. Lower precision can also speed up inference because it takes less time to perform calculations with fewer bits.
Quantization techniques focus on representing data with less information while also trying not to lose too much accuracy. This often means converting a data type to represent the same information with fewer bits. For example, if your model weights are stored as 32-bit floating points and they're quantized to 16-bit floating points, this halves the model size, which makes it easier to store and reduces memory usage. Lower precision can also speed up inference because it takes less time to perform calculations with fewer bits.
Diffusers supports multiple quantization backends to make large diffusion models like [Flux](../api/pipelines/flux) more accessible. This guide shows how to use the [`~quantizers.PipelineQuantizationConfig`] class to quantize a pipeline during its initialization from a pretrained or non-quantized checkpoint.
<Tip>
Interested in adding a new quantization method to Diffusers? Refer to the [Contribute new quantization method guide](https://huggingface.co/docs/transformers/main/en/quantization/contribute) to learn more about adding a new quantization method.
</Tip>
<Tip>
If you are new to quantization, we recommend checking out these beginner-friendly courses about quantization, created in collaboration with DeepLearning.AI:
* [Quantization Fundamentals with Hugging Face](https://www.deeplearning.ai/short-courses/quantization-fundamentals-with-hugging-face/)
* [Quantization in Depth](https://www.deeplearning.ai/short-courses/quantization-in-depth/)
</Tip>
## When to use what?
Diffusers currently supports the following quantization methods.
- [BitsandBytes](./bitsandbytes)
- [TorchAO](./torchao)
- [GGUF](./gguf)
- [Quanto](./quanto)
[This resource](https://huggingface.co/docs/transformers/main/en/quantization/overview#when-to-use-what) provides a good overview of the pros and cons of different quantization techniques.
## Pipeline-level quantization
There are two ways you can use [`~quantizers.PipelineQuantizationConfig`] depending on the level of control you want over the quantization specifications of each model in the pipeline.
Diffusers allows users to directly initialize pipelines from checkpoints that may contain quantized models ([example](https://huggingface.co/hf-internal-testing/flux.1-dev-nf4-pkg)). However, users may want to apply
quantization on-the-fly when initializing a pipeline from a pre-trained and non-quantized checkpoint. You can
do this with [`~quantizers.PipelineQuantizationConfig`].
- For basic and simple use cases, you only need to define the `quant_backend`, `quant_kwargs`, and `components_to_quantize`.
- For more granular quantization control, provide a `quant_mapping` with the quantization specifications for the individual model components.
### Simple quantization
Initialize [`~quantizers.PipelineQuantizationConfig`] with the following parameters.
- `quant_backend` specifies which quantization backend to use. Currently supported backends include: `bitsandbytes_4bit`, `bitsandbytes_8bit`, `gguf`, `quanto`, and `torchao`.
- `quant_kwargs` contains the specific quantization arguments to use.
- `components_to_quantize` specifies which components of the pipeline to quantize. Typically, you should quantize the most compute-intensive components, like the transformer. The text encoder is another component to consider quantizing if a pipeline has more than one, such as [`FluxPipeline`]. The example below quantizes the T5 text encoder in [`FluxPipeline`] while keeping the CLIP model intact.
Start by defining a `PipelineQuantizationConfig`:
```py
import torch
from diffusers import DiffusionPipeline
from diffusers.quantizers.quantization_config import QuantoConfig
from diffusers.quantizers import PipelineQuantizationConfig
from transformers import BitsAndBytesConfig
pipeline_quant_config = PipelineQuantizationConfig(
quant_mapping={
"transformer": QuantoConfig(weights_dtype="int8"),
"text_encoder_2": BitsAndBytesConfig(
load_in_4bit=True, compute_dtype=torch.bfloat16
),
}
)
```
Then pass it to [`~DiffusionPipeline.from_pretrained`] and run inference:
```py
pipe = DiffusionPipeline.from_pretrained(
"black-forest-labs/FLUX.1-dev",
quantization_config=pipeline_quant_config,
torch_dtype=torch.bfloat16,
).to("cuda")
image = pipe("photo of a cute dog").images[0]
```
This method allows for more granular control over the quantization specifications of individual
model-level components of a pipeline. It also allows for different quantization backends for
different components. In the above example, you used a combination of Quanto and BitsandBytes. However,
one caveat of this method is that users need to know which components come from `transformers` to be able
to import the right quantization config class.
The other method is simpler to use but less flexible. Start by defining a `PipelineQuantizationConfig` in a different way:
```py
pipeline_quant_config = PipelineQuantizationConfig(
quant_backend="bitsandbytes_4bit",
quant_kwargs={"load_in_4bit": True, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": torch.bfloat16},
@@ -44,89 +94,35 @@ pipeline_quant_config = PipelineQuantizationConfig(
)
```
Pass the `pipeline_quant_config` to [`~DiffusionPipeline.from_pretrained`] to quantize the pipeline.
This `pipeline_quant_config` can now be passed to [`~DiffusionPipeline.from_pretrained`] as in the example above.
In this case, `quant_kwargs` will be used to initialize the quantization specifications
of the respective quantization configuration class of `quant_backend`. `components_to_quantize`
is used to denote the components that will be quantized. For most pipelines, you would want to
keep `transformer` in the list as that is often the most compute and memory intensive.
The config below will work for most diffusion pipelines that have a `transformer` component present.
In most cases, you will want to quantize the `transformer` component, as that is often the most compute-intensive part of a diffusion pipeline.
```py
pipe = DiffusionPipeline.from_pretrained(
"black-forest-labs/FLUX.1-dev",
quantization_config=pipeline_quant_config,
torch_dtype=torch.bfloat16,
).to("cuda")
image = pipe("photo of a cute dog").images[0]
```
### quant_mapping
The `quant_mapping` argument provides more flexible options for how to quantize each individual component in a pipeline, like combining different quantization backends.
Initialize [`~quantizers.PipelineQuantizationConfig`] and pass a `quant_mapping` to it. The `quant_mapping` allows you to specify the quantization options for each component in the pipeline such as the transformer and text encoder.
The example below uses two quantization backends, [`~quantizers.QuantoConfig`] and [`transformers.BitsAndBytesConfig`], for the transformer and text encoder.
```py
import torch
from diffusers import DiffusionPipeline
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
from diffusers.quantizers.quantization_config import QuantoConfig
from diffusers.quantizers import PipelineQuantizationConfig
from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
pipeline_quant_config = PipelineQuantizationConfig(
quant_mapping={
"transformer": QuantoConfig(weights_dtype="int8"),
"text_encoder_2": TransformersBitsAndBytesConfig(
load_in_4bit=True, compute_dtype=torch.bfloat16
),
}
quant_backend="bitsandbytes_4bit",
quant_kwargs={"load_in_4bit": True, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": torch.bfloat16},
components_to_quantize=["transformer"],
)
```
There is a separate bitsandbytes backend in [Transformers](https://huggingface.co/docs/transformers/main_classes/quantization#transformers.BitsAndBytesConfig). You need to import and use [`transformers.BitsAndBytesConfig`] for components that come from Transformers. For example, `text_encoder_2` in [`FluxPipeline`] is a [`~transformers.T5EncoderModel`] from Transformers so you need to use [`transformers.BitsAndBytesConfig`] instead of [`diffusers.BitsAndBytesConfig`].
Below is a list of the supported quantization backends available in both `diffusers` and `transformers`:
> [!TIP]
> Use the [simple quantization](#simple-quantization) method above if you don't want to manage these distinct imports or aren't sure where each pipeline component comes from.
* `bitsandbytes_4bit`
* `bitsandbytes_8bit`
* `gguf`
* `quanto`
* `torchao`
```py
import torch
from diffusers import DiffusionPipeline
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
from diffusers.quantizers import PipelineQuantizationConfig
from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
pipeline_quant_config = PipelineQuantizationConfig(
quant_mapping={
"transformer": DiffusersBitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16),
"text_encoder_2": TransformersBitsAndBytesConfig(
load_in_4bit=True, compute_dtype=torch.bfloat16
),
}
)
```
Pass the `pipeline_quant_config` to [`~DiffusionPipeline.from_pretrained`] to quantize the pipeline.
```py
pipe = DiffusionPipeline.from_pretrained(
"black-forest-labs/FLUX.1-dev",
quantization_config=pipeline_quant_config,
torch_dtype=torch.bfloat16,
).to("cuda")
image = pipe("photo of a cute dog").images[0]
```
## Resources
Check out the resources below to learn more about quantization.
- If you are new to quantization, we recommend checking out the following beginner-friendly courses in collaboration with DeepLearning.AI.
- [Quantization Fundamentals with Hugging Face](https://www.deeplearning.ai/short-courses/quantization-fundamentals-with-hugging-face/)
- [Quantization in Depth](https://www.deeplearning.ai/short-courses/quantization-in-depth/)
- Refer to the [Contribute new quantization method guide](https://huggingface.co/docs/transformers/main/en/quantization/contribute) if you're interested in adding a new quantization method.
- The Transformers quantization [Overview](https://huggingface.co/docs/transformers/quantization/overview#when-to-use-what) provides an overview of the pros and cons of different quantization backends.
- Read the [Exploring Quantization Backends in Diffusers](https://huggingface.co/blog/diffusers-quantization) blog post for a brief introduction to each quantization backend, how to choose a backend, and combining quantization with other memory optimizations.
Diffusion pipelines can have multiple text encoders; [`FluxPipeline`] has two, for example. It's recommended to quantize the memory-intensive text encoders, such as T5, Llama, or Gemma. In the above example, you quantized the T5 model of [`FluxPipeline`] through `text_encoder_2` while keeping the CLIP model intact (accessible through `text_encoder`).
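A combined sketch of the simpler approach, assuming the same FLUX.1-dev checkpoint and the bitsandbytes 4-bit backend used above, with both the transformer and the T5 text encoder listed in `components_to_quantize`:
```py
import torch
from diffusers import DiffusionPipeline
from diffusers.quantizers import PipelineQuantizationConfig

# Sketch: quantize the compute-heavy transformer and the memory-heavy T5 text
# encoder (text_encoder_2 in FluxPipeline), leaving the CLIP text encoder
# (text_encoder) in full precision.
pipeline_quant_config = PipelineQuantizationConfig(
    quant_backend="bitsandbytes_4bit",
    quant_kwargs={
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
    },
    components_to_quantize=["transformer", "text_encoder_2"],
)

pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    quantization_config=pipeline_quant_config,
    torch_dtype=torch.bfloat16,
).to("cuda")
image = pipe("photo of a cute dog").images[0]
```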

View File

@@ -175,7 +175,7 @@
- local: optimization/mps
title: Metal Performance Shaders (MPS)
- local: optimization/habana
title: Intel Gaudi
title: Habana Gaudi
title: Optimized hardware
title: Accelerate inference and reduce memory
- sections:

View File

@@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
specific language governing permissions and limitations under the License.
-->
# How to use Stable Diffusion on Intel Gaudi
# How to use Stable Diffusion on Habana Gaudi
🤗 Diffusers is compatible with Habana Gaudi through 🤗 [Optimum Habana](https://huggingface.co/docs/optimum/habana/usage_guides/stable_diffusion).

View File

@@ -12,16 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
import os
from typing import Optional, Union
from huggingface_hub.utils import validate_hf_hub_args
from ..configuration_utils import ConfigMixin
from ..utils import logging
logger = logging.get_logger(__name__)
class AutoModel(ConfigMixin):
@@ -155,50 +152,15 @@ class AutoModel(ConfigMixin):
"token": token,
"local_files_only": local_files_only,
"revision": revision,
"subfolder": subfolder,
}
library = None
orig_class_name = None
config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
orig_class_name = config["_class_name"]
# Always attempt to fetch model_index.json first
try:
cls.config_name = "model_index.json"
config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
if subfolder is not None and subfolder in config:
library, orig_class_name = config[subfolder]
load_config_kwargs.update({"subfolder": subfolder})
except EnvironmentError as e:
logger.debug(e)
# Unable to load from model_index.json so fallback to loading from config
if library is None and orig_class_name is None:
cls.config_name = "config.json"
config = cls.load_config(pretrained_model_or_path, subfolder=subfolder, **load_config_kwargs)
if "_class_name" in config:
# If we find a class name in the config, we can try to load the model as a diffusers model
orig_class_name = config["_class_name"]
library = "diffusers"
load_config_kwargs.update({"subfolder": subfolder})
elif "model_type" in config:
orig_class_name = "AutoModel"
library = "transformers"
load_config_kwargs.update({"subfolder": "" if subfolder is None else subfolder})
else:
raise ValueError(f"Couldn't find model associated with the config file at {pretrained_model_or_path}.")
from ..pipelines.pipeline_loading_utils import ALL_IMPORTABLE_CLASSES, get_class_obj_and_candidates
model_cls, _ = get_class_obj_and_candidates(
library_name=library,
class_name=orig_class_name,
importable_classes=ALL_IMPORTABLE_CLASSES,
pipelines=None,
is_pipeline_module=False,
)
library = importlib.import_module("diffusers")
model_cls = getattr(library, orig_class_name, None)
if model_cls is None:
raise ValueError(f"AutoModel can't find a model linked to {orig_class_name}.")

View File

@@ -92,7 +92,7 @@ for library in LOADABLE_CLASSES:
ALL_IMPORTABLE_CLASSES.update(LOADABLE_CLASSES[library])
def is_safetensors_compatible(filenames, passed_components=None, folder_names=None, variant=None) -> bool:
def is_safetensors_compatible(filenames, passed_components=None, folder_names=None) -> bool:
"""
Checking for safetensors compatibility:
- The model is safetensors compatible only if there is a safetensors file for each model component present in
@@ -103,31 +103,6 @@ def is_safetensors_compatible(filenames, passed_components=None, folder_names=No
- For models from the transformers library, the filename changes from "pytorch_model" to "model", and the ".bin"
extension is replaced with ".safetensors"
"""
weight_names = [
WEIGHTS_NAME,
SAFETENSORS_WEIGHTS_NAME,
FLAX_WEIGHTS_NAME,
ONNX_WEIGHTS_NAME,
ONNX_EXTERNAL_WEIGHTS_NAME,
]
if is_transformers_available():
weight_names += [TRANSFORMERS_WEIGHTS_NAME, TRANSFORMERS_SAFE_WEIGHTS_NAME, TRANSFORMERS_FLAX_WEIGHTS_NAME]
# model_pytorch, diffusion_model_pytorch, ...
weight_prefixes = [w.split(".")[0] for w in weight_names]
# .bin, .safetensors, ...
weight_suffixs = [w.split(".")[-1] for w in weight_names]
# -00001-of-00002
transformers_index_format = r"\d{5}-of-\d{5}"
# `diffusion_pytorch_model.bin` as well as `model-00001-of-00002.safetensors`
variant_file_re = re.compile(
rf"({'|'.join(weight_prefixes)})\.({variant}|{variant}-{transformers_index_format})\.({'|'.join(weight_suffixs)})$"
)
non_variant_file_re = re.compile(
rf"({'|'.join(weight_prefixes)})(-{transformers_index_format})?\.({'|'.join(weight_suffixs)})$"
)
passed_components = passed_components or []
if folder_names:
filenames = {f for f in filenames if os.path.split(f)[0] in folder_names}
@@ -146,29 +121,15 @@ def is_safetensors_compatible(filenames, passed_components=None, folder_names=No
components[component].append(component_filename)
# If there are no component folders check the main directory for safetensors files
filtered_filenames = set()
if not components:
if variant is not None:
filtered_filenames = filter_with_regex(filenames, variant_file_re)
# If no variant filenames exist check if non-variant files are available
if not filtered_filenames:
filtered_filenames = filter_with_regex(filenames, non_variant_file_re)
return any(".safetensors" in filename for filename in filtered_filenames)
return any(".safetensors" in filename for filename in filenames)
# iterate over all files of a component
# check if safetensor files exist for that component
# if variant is provided check if the variant of the safetensors exists
for component, component_filenames in components.items():
matches = []
filtered_component_filenames = set()
# if variant is provided check if the variant of the safetensors exists
if variant is not None:
filtered_component_filenames = filter_with_regex(component_filenames, variant_file_re)
# if variant safetensor files do not exist check for non-variants
if not filtered_component_filenames:
filtered_component_filenames = filter_with_regex(component_filenames, non_variant_file_re)
for component_filename in filtered_component_filenames:
for component_filename in component_filenames:
filename, extension = os.path.splitext(component_filename)
match_exists = extension == ".safetensors"
@@ -198,10 +159,6 @@ def filter_model_files(filenames):
return [f for f in filenames if any(f.endswith(extension) for extension in allowed_extensions)]
def filter_with_regex(filenames, pattern_re):
return {f for f in filenames if pattern_re.match(f.split("/")[-1]) is not None}
def variant_compatible_siblings(filenames, variant=None, ignore_patterns=None) -> Union[List[os.PathLike], str]:
weight_names = [
WEIGHTS_NAME,
@@ -250,6 +207,9 @@ def variant_compatible_siblings(filenames, variant=None, ignore_patterns=None) -
# interested in the extension name
return {f for f in filenames if not any(f.endswith(pat.lstrip("*.")) for pat in ignore_patterns)}
def filter_with_regex(filenames, pattern_re):
return {f for f in filenames if pattern_re.match(f.split("/")[-1]) is not None}
# Group files by component
components = {}
for filename in filenames:
@@ -375,14 +335,14 @@ def get_class_obj_and_candidates(
library_name, class_name, importable_classes, pipelines, is_pipeline_module, component_name=None, cache_dir=None
):
"""Simple helper method to retrieve class object of module as well as potential parent class objects"""
component_folder = os.path.join(cache_dir, component_name) if component_name and cache_dir else None
component_folder = os.path.join(cache_dir, component_name)
if is_pipeline_module:
pipeline_module = getattr(pipelines, library_name)
class_obj = getattr(pipeline_module, class_name)
class_candidates = dict.fromkeys(importable_classes.keys(), class_obj)
elif component_folder and os.path.isfile(os.path.join(component_folder, library_name + ".py")):
elif os.path.isfile(os.path.join(component_folder, library_name + ".py")):
# load custom component
class_obj = get_class_from_dynamic_module(
component_folder, module_file=library_name + ".py", class_name=class_name
@@ -1037,7 +997,7 @@ def _get_ignore_patterns(
use_safetensors
and not allow_pickle
and not is_safetensors_compatible(
model_filenames, passed_components=passed_components, folder_names=model_folder_names, variant=variant
model_filenames, passed_components=passed_components, folder_names=model_folder_names
)
):
raise EnvironmentError(
@@ -1048,7 +1008,7 @@ def _get_ignore_patterns(
ignore_patterns = ["*.bin", "*.safetensors", "*.onnx", "*.pb"]
elif use_safetensors and is_safetensors_compatible(
model_filenames, passed_components=passed_components, folder_names=model_folder_names, variant=variant
model_filenames, passed_components=passed_components, folder_names=model_folder_names
):
ignore_patterns = ["*.bin", "*.msgpack"]

View File

@@ -635,10 +635,10 @@ def load_numpy(arry: Union[str, np.ndarray], local_path: Optional[str] = None) -
return arry
def load_pt(url: str, map_location: Optional[str] = None, weights_only: Optional[bool] = True):
def load_pt(url: str, map_location: str):
response = requests.get(url, timeout=DIFFUSERS_REQUEST_TIMEOUT)
response.raise_for_status()
arry = torch.load(BytesIO(response.content), map_location=map_location, weights_only=weights_only)
arry = torch.load(BytesIO(response.content), map_location=map_location)
return arry
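Assuming the version of `load_pt` with the extended signature shown above, a usage sketch mirroring the Stable Cascade test later in this diff would be:
```py
from diffusers.utils.testing_utils import load_pt

# Sketch: download a serialized tensor and load it onto the CPU. In this version
# weights_only defaults to True, which avoids executing arbitrary pickled code.
image_embedding = load_pt(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_cascade/image_embedding.pt",
    map_location="cpu",
)
```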

View File

@@ -1748,14 +1748,14 @@ class TorchCompileTesterMixin:
def setUp(self):
# clean up the VRAM before each test
super().setUp()
torch.compiler.reset()
torch._dynamo.reset()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test in case of CUDA runtime errors
super().tearDown()
torch.compiler.reset()
torch._dynamo.reset()
gc.collect()
backend_empty_cache(torch_device)
@@ -1764,17 +1764,13 @@ class TorchCompileTesterMixin:
@is_torch_compile
@slow
def test_torch_compile_recompilation_and_graph_break(self):
torch.compiler.reset()
torch._dynamo.reset()
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
model = self.model_class(**init_dict).to(torch_device)
model = torch.compile(model, fullgraph=True)
with (
torch._inductor.utils.fresh_inductor_cache(),
torch._dynamo.config.patch(error_on_recompile=True),
torch.no_grad(),
):
with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
_ = model(**inputs_dict)
_ = model(**inputs_dict)
@@ -1802,7 +1798,7 @@ class LoraHotSwappingForModelTesterMixin:
# It is critical that the dynamo cache is reset for each test. Otherwise, if the test re-uses the same model,
# there will be recompilation errors, as torch caches the model when run in the same process.
super().tearDown()
torch.compiler.reset()
torch._dynamo.reset()
gc.collect()
backend_empty_cache(torch_device)
@@ -1919,7 +1915,7 @@ class LoraHotSwappingForModelTesterMixin:
def test_hotswapping_compiled_model_linear(self, rank0, rank1):
# It's important to add this context to raise an error on recompilation
target_modules = ["to_q", "to_k", "to_v", "to_out.0"]
with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
with torch._dynamo.config.patch(error_on_recompile=True):
self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
@parameterized.expand([(11, 11), (7, 13), (13, 7)]) # important to test small to large and vice versa
@@ -1929,7 +1925,7 @@ class LoraHotSwappingForModelTesterMixin:
# It's important to add this context to raise an error on recompilation
target_modules = ["conv", "conv1", "conv2"]
with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
with torch._dynamo.config.patch(error_on_recompile=True):
self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
@parameterized.expand([(11, 11), (7, 13), (13, 7)]) # important to test small to large and vice versa
@@ -1939,7 +1935,7 @@ class LoraHotSwappingForModelTesterMixin:
# It's important to add this context to raise an error on recompilation
target_modules = ["to_q", "conv"]
with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
with torch._dynamo.config.patch(error_on_recompile=True):
self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
@parameterized.expand([(11, 11), (7, 13), (13, 7)]) # important to test small to large and vice versa

View File

@@ -1,32 +0,0 @@
import unittest
from unittest.mock import patch
from transformers import CLIPTextModel, LongformerModel
from diffusers.models import AutoModel, UNet2DConditionModel
class TestAutoModel(unittest.TestCase):
@patch(
"diffusers.models.AutoModel.load_config",
side_effect=[EnvironmentError("File not found"), {"_class_name": "UNet2DConditionModel"}],
)
def test_load_from_config_diffusers_with_subfolder(self, mock_load_config):
model = AutoModel.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet")
assert isinstance(model, UNet2DConditionModel)
@patch(
"diffusers.models.AutoModel.load_config",
side_effect=[EnvironmentError("File not found"), {"model_type": "clip_text_model"}],
)
def test_load_from_config_transformers_with_subfolder(self, mock_load_config):
model = AutoModel.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="text_encoder")
assert isinstance(model, CLIPTextModel)
def test_load_from_config_without_subfolder(self):
model = AutoModel.from_pretrained("hf-internal-testing/tiny-random-longformer")
assert isinstance(model, LongformerModel)
def test_load_from_model_index(self):
model = AutoModel.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="text_encoder")
assert isinstance(model, CLIPTextModel)

View File

@@ -19,16 +19,20 @@ import torch
from diffusers import HunyuanVideoTransformer3DModel
from diffusers.utils.testing_utils import (
enable_full_determinism,
is_torch_compile,
require_torch_2,
require_torch_gpu,
slow,
torch_device,
)
from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin
from ..test_modeling_common import ModelTesterMixin
enable_full_determinism()
class HunyuanVideoTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
class HunyuanVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
model_class = HunyuanVideoTransformer3DModel
main_input_name = "hidden_states"
uses_custom_attn_processor = True
@@ -92,8 +96,23 @@ class HunyuanVideoTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin,
expected_set = {"HunyuanVideoTransformer3DModel"}
super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
@require_torch_gpu
@require_torch_2
@is_torch_compile
@slow
def test_torch_compile_recompilation_and_graph_break(self):
torch._dynamo.reset()
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
class HunyuanSkyreelsImageToVideoTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
model = self.model_class(**init_dict).to(torch_device)
model = torch.compile(model, fullgraph=True)
with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
_ = model(**inputs_dict)
_ = model(**inputs_dict)
class HunyuanSkyreelsImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
model_class = HunyuanVideoTransformer3DModel
main_input_name = "hidden_states"
uses_custom_attn_processor = True
@@ -160,8 +179,23 @@ class HunyuanSkyreelsImageToVideoTransformer3DTests(ModelTesterMixin, TorchCompi
expected_set = {"HunyuanVideoTransformer3DModel"}
super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
@require_torch_gpu
@require_torch_2
@is_torch_compile
@slow
def test_torch_compile_recompilation_and_graph_break(self):
torch._dynamo.reset()
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
class HunyuanVideoImageToVideoTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
model = self.model_class(**init_dict).to(torch_device)
model = torch.compile(model, fullgraph=True)
with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
_ = model(**inputs_dict)
_ = model(**inputs_dict)
class HunyuanVideoImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
model_class = HunyuanVideoTransformer3DModel
main_input_name = "hidden_states"
uses_custom_attn_processor = True
@@ -226,10 +260,23 @@ class HunyuanVideoImageToVideoTransformer3DTests(ModelTesterMixin, TorchCompileT
expected_set = {"HunyuanVideoTransformer3DModel"}
super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
@require_torch_gpu
@require_torch_2
@is_torch_compile
@slow
def test_torch_compile_recompilation_and_graph_break(self):
torch._dynamo.reset()
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
class HunyuanVideoTokenReplaceImageToVideoTransformer3DTests(
ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase
):
model = self.model_class(**init_dict).to(torch_device)
model = torch.compile(model, fullgraph=True)
with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
_ = model(**inputs_dict)
_ = model(**inputs_dict)
class HunyuanVideoTokenReplaceImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
model_class = HunyuanVideoTransformer3DModel
main_input_name = "hidden_states"
uses_custom_attn_processor = True
@@ -295,3 +342,18 @@ class HunyuanVideoTokenReplaceImageToVideoTransformer3DTests(
def test_gradient_checkpointing_is_applied(self):
expected_set = {"HunyuanVideoTransformer3DModel"}
super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
@require_torch_gpu
@require_torch_2
@is_torch_compile
@slow
def test_torch_compile_recompilation_and_graph_break(self):
torch._dynamo.reset()
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
model = self.model_class(**init_dict).to(torch_device)
model = torch.compile(model, fullgraph=True)
with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
_ = model(**inputs_dict)
_ = model(**inputs_dict)

View File

@@ -19,16 +19,20 @@ import torch
from diffusers import WanTransformer3DModel
from diffusers.utils.testing_utils import (
enable_full_determinism,
is_torch_compile,
require_torch_2,
require_torch_gpu,
slow,
torch_device,
)
from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin
from ..test_modeling_common import ModelTesterMixin
enable_full_determinism()
class WanTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
class WanTransformer3DTests(ModelTesterMixin, unittest.TestCase):
model_class = WanTransformer3DModel
main_input_name = "hidden_states"
uses_custom_attn_processor = True
@@ -82,3 +86,18 @@ class WanTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.
def test_gradient_checkpointing_is_applied(self):
expected_set = {"WanTransformer3DModel"}
super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
@require_torch_gpu
@require_torch_2
@is_torch_compile
@slow
def test_torch_compile_recompilation_and_graph_break(self):
torch._dynamo.reset()
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
model = self.model_class(**init_dict).to(torch_device)
model = torch.compile(model, fullgraph=True)
with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
_ = model(**inputs_dict)
_ = model(**inputs_dict)

View File

@@ -15,6 +15,7 @@
import gc
import tempfile
import traceback
import unittest
import numpy as np
@@ -38,9 +39,13 @@ from diffusers.utils.testing_utils import (
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism,
get_python_version,
is_torch_compile,
load_image,
load_numpy,
require_torch_2,
require_torch_accelerator,
run_test_in_subprocess,
slow,
torch_device,
)
@@ -63,6 +68,52 @@ from ..test_pipelines_common import (
enable_full_determinism()
# Will be run via run_test_in_subprocess
def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
error = None
try:
_ = in_queue.get(timeout=timeout)
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
)
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
pipe.unet.to(memory_format=torch.channels_last)
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
pipe.controlnet.to(memory_format=torch.channels_last)
pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True)
generator = torch.Generator(device="cpu").manual_seed(0)
prompt = "bird"
image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
).resize((512, 512))
output = pipe(prompt, image, num_inference_steps=10, generator=generator, output_type="np")
image = output.images[0]
assert image.shape == (512, 512, 3)
expected_image = load_numpy(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy"
)
expected_image = np.resize(expected_image, (512, 512, 3))
assert np.abs(expected_image - image).max() < 1.0
except Exception:
error = f"{traceback.format_exc()}"
results = {"error": error}
out_queue.put(results, timeout=timeout)
out_queue.join()
class ControlNetPipelineFastTests(
IPAdapterTesterMixin,
PipelineLatentTesterMixin,
@@ -1002,6 +1053,15 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
expected_slice = np.array([0.1655, 0.1721, 0.1623, 0.1685, 0.1711, 0.1646, 0.1651, 0.1631, 0.1494])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@is_torch_compile
@require_torch_2
@unittest.skipIf(
get_python_version() == (3, 12),
reason="Torch Dynamo isn't yet supported for Python 3.12.",
)
def test_stable_diffusion_compile(self):
run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=None)
def test_v11_shuffle_global_pool_conditions(self):
controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11e_sd15_shuffle")

View File

@@ -14,6 +14,7 @@
# limitations under the License.
import gc
import traceback
import unittest
import numpy as np
@@ -35,9 +36,13 @@ from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
is_torch_compile,
load_image,
load_numpy,
require_accelerator,
require_torch_2,
require_torch_accelerator,
run_test_in_subprocess,
slow,
torch_device,
)
@@ -73,6 +78,53 @@ def to_np(tensor):
return tensor
# Will be run via run_test_in_subprocess
def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
error = None
try:
_ = in_queue.get(timeout=timeout)
controlnet = ControlNetXSAdapter.from_pretrained(
"UmerHA/Testing-ConrolNetXS-SD2.1-canny", torch_dtype=torch.float16
)
pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
"stabilityai/stable-diffusion-2-1-base",
controlnet=controlnet,
safety_checker=None,
torch_dtype=torch.float16,
)
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
pipe.unet.to(memory_format=torch.channels_last)
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
generator = torch.Generator(device="cpu").manual_seed(0)
prompt = "bird"
image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
).resize((512, 512))
output = pipe(prompt, image, num_inference_steps=10, generator=generator, output_type="np")
image = output.images[0]
assert image.shape == (512, 512, 3)
expected_image = load_numpy(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy"
)
expected_image = np.resize(expected_image, (512, 512, 3))
assert np.abs(expected_image - image).max() < 1.0
except Exception:
error = f"{traceback.format_exc()}"
results = {"error": error}
out_queue.put(results, timeout=timeout)
out_queue.join()
class ControlNetXSPipelineFastTests(
PipelineLatentTesterMixin,
PipelineKarrasSchedulerTesterMixin,
@@ -350,3 +402,8 @@ class ControlNetXSPipelineSlowTests(unittest.TestCase):
original_image = image[-3:, -3:, -1].flatten()
expected_image = np.array([0.4844, 0.4937, 0.4956, 0.4663, 0.5039, 0.5044, 0.4565, 0.4883, 0.4941])
assert np.allclose(original_image, expected_image, atol=1e-04)
@is_torch_compile
@require_torch_2
def test_stable_diffusion_compile(self):
run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=None)

View File

@@ -304,8 +304,7 @@ class StableCascadeDecoderPipelineIntegrationTests(unittest.TestCase):
generator = torch.Generator(device="cpu").manual_seed(0)
image_embedding = load_pt(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_cascade/image_embedding.pt",
map_location=torch_device,
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_cascade/image_embedding.pt"
)
image = pipe(
@@ -321,4 +320,4 @@ class StableCascadeDecoderPipelineIntegrationTests(unittest.TestCase):
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_cascade/stable_cascade_decoder_image.npy"
)
max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
assert max_diff < 2e-4
assert max_diff < 1e-4

View File

@@ -17,6 +17,7 @@
import gc
import tempfile
import time
import traceback
import unittest
import numpy as np
@@ -48,12 +49,16 @@ from diffusers.utils.testing_utils import (
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism,
is_torch_compile,
load_image,
load_numpy,
nightly,
numpy_cosine_similarity_distance,
require_accelerate_version_greater,
require_torch_2,
require_torch_accelerator,
require_torch_multi_accelerator,
run_test_in_subprocess,
skip_mps,
slow,
torch_device,
@@ -76,6 +81,39 @@ from ..test_pipelines_common import (
enable_full_determinism()
# Will be run via run_test_in_subprocess
def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
error = None
try:
inputs = in_queue.get(timeout=timeout)
torch_device = inputs.pop("torch_device")
seed = inputs.pop("seed")
inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe.unet.to(memory_format=torch.channels_last)
sd_pipe.unet = torch.compile(sd_pipe.unet, mode="reduce-overhead", fullgraph=True)
sd_pipe.set_progress_bar_config(disable=None)
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239])
assert np.abs(image_slice - expected_slice).max() < 5e-3
except Exception:
error = f"{traceback.format_exc()}"
results = {"error": error}
out_queue.put(results, timeout=timeout)
out_queue.join()
class StableDiffusionPipelineFastTests(
IPAdapterTesterMixin,
PipelineLatentTesterMixin,
@@ -1186,6 +1224,40 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
max_diff = np.abs(expected_image - image).max()
assert max_diff < 8e-1
@is_torch_compile
@require_torch_2
def test_stable_diffusion_compile(self):
seed = 0
inputs = self.get_inputs(torch_device, seed=seed)
# Can't pickle a Generator object
del inputs["generator"]
inputs["torch_device"] = torch_device
inputs["seed"] = seed
run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=inputs)
def test_stable_diffusion_lcm(self):
unet = UNet2DConditionModel.from_pretrained("SimianLuo/LCM_Dreamshaper_v7", subfolder="unet")
sd_pipe = StableDiffusionPipeline.from_pretrained("Lykon/dreamshaper-7", unet=unet).to(torch_device)
sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs(torch_device)
inputs["num_inference_steps"] = 6
inputs["output_type"] = "pil"
image = sd_pipe(**inputs).images[0]
expected_image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/lcm_full/stable_diffusion_lcm.png"
)
image = sd_pipe.image_processor.pil_to_numpy(image)
expected_image = sd_pipe.image_processor.pil_to_numpy(expected_image)
max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
assert max_diff < 1e-2
@slow
@require_torch_accelerator

View File

@@ -15,6 +15,7 @@
import gc
import random
import traceback
import unittest
import numpy as np
@@ -40,10 +41,13 @@ from diffusers.utils.testing_utils import (
backend_reset_peak_memory_stats,
enable_full_determinism,
floats_tensor,
is_torch_compile,
load_image,
load_numpy,
nightly,
require_torch_2,
require_torch_accelerator,
run_test_in_subprocess,
skip_mps,
slow,
torch_device,
@@ -66,6 +70,38 @@ from ..test_pipelines_common import (
enable_full_determinism()
# Will be run via run_test_in_subprocess
def _test_img2img_compile(in_queue, out_queue, timeout):
error = None
try:
inputs = in_queue.get(timeout=timeout)
torch_device = inputs.pop("torch_device")
seed = inputs.pop("seed")
inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.unet.set_default_attn_processor()
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
pipe.unet.to(memory_format=torch.channels_last)
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 768, 3)
expected_slice = np.array([0.0606, 0.0570, 0.0805, 0.0579, 0.0628, 0.0623, 0.0843, 0.1115, 0.0806])
assert np.abs(expected_slice - image_slice).max() < 1e-3
except Exception:
error = f"{traceback.format_exc()}"
results = {"error": error}
out_queue.put(results, timeout=timeout)
out_queue.join()
class StableDiffusionImg2ImgPipelineFastTests(
IPAdapterTesterMixin,
PipelineLatentTesterMixin,
@@ -618,6 +654,17 @@ class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase):
assert out.nsfw_content_detected[0], f"Safety checker should work for prompt: {inputs['prompt']}"
assert np.abs(out.images[0]).sum() < 1e-5 # should be all zeros
@is_torch_compile
@require_torch_2
def test_img2img_compile(self):
seed = 0
inputs = self.get_inputs(torch_device, seed=seed)
# Can't pickle a Generator object
del inputs["generator"]
inputs["torch_device"] = torch_device
inputs["seed"] = seed
run_test_in_subprocess(test_case=self, target_func=_test_img2img_compile, inputs=inputs)
@nightly
@require_torch_accelerator

View File

@@ -15,6 +15,7 @@
import gc
import random
import traceback
import unittest
import numpy as np
@@ -43,10 +44,13 @@ from diffusers.utils.testing_utils import (
backend_reset_peak_memory_stats,
enable_full_determinism,
floats_tensor,
is_torch_compile,
load_image,
load_numpy,
nightly,
require_torch_2,
require_torch_accelerator,
run_test_in_subprocess,
slow,
torch_device,
)
@@ -67,6 +71,40 @@ from ..test_pipelines_common import (
enable_full_determinism()
# Will be run via run_test_in_subprocess
def _test_inpaint_compile(in_queue, out_queue, timeout):
error = None
try:
inputs = in_queue.get(timeout=timeout)
torch_device = inputs.pop("torch_device")
seed = inputs.pop("seed")
inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
pipe = StableDiffusionInpaintPipeline.from_pretrained(
"botp/stable-diffusion-v1-5-inpainting", safety_checker=None
)
pipe.unet.set_default_attn_processor()
pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config)
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
pipe.unet.to(memory_format=torch.channels_last)
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
image = pipe(**inputs).images
image_slice = image[0, 253:256, 253:256, -1].flatten()
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.0689, 0.0699, 0.0790, 0.0536, 0.0470, 0.0488, 0.041, 0.0508, 0.04179])
assert np.abs(expected_slice - image_slice).max() < 3e-3
except Exception:
error = f"{traceback.format_exc()}"
results = {"error": error}
out_queue.put(results, timeout=timeout)
out_queue.join()
class StableDiffusionInpaintPipelineFastTests(
IPAdapterTesterMixin,
PipelineLatentTesterMixin,
@@ -689,6 +727,17 @@ class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase):
# make sure that less than 2.2 GB is allocated
assert mem_bytes < 2.2 * 10**9
@is_torch_compile
@require_torch_2
def test_inpaint_compile(self):
seed = 0
inputs = self.get_inputs(torch_device, seed=seed)
# Can't pickle a Generator object
del inputs["generator"]
inputs["torch_device"] = torch_device
inputs["seed"] = seed
run_test_in_subprocess(test_case=self, target_func=_test_inpaint_compile, inputs=inputs)
def test_stable_diffusion_inpaint_pil_input_resolution_test(self):
pipe = StableDiffusionInpaintPipeline.from_pretrained(
"botp/stable-diffusion-v1-5-inpainting", safety_checker=None
@@ -915,6 +964,11 @@ class StableDiffusionInpaintPipelineAsymmetricAutoencoderKLSlowTests(unittest.Te
# make sure that less than 2.45 GB is allocated
assert mem_bytes < 2.45 * 10**9
@is_torch_compile
@require_torch_2
def test_inpaint_compile(self):
pass
def test_stable_diffusion_inpaint_pil_input_resolution_test(self):
vae = AsymmetricAutoencoderKL.from_pretrained(
"cross-attention/asymmetric-autoencoder-kl-x-1-5",

View File

@@ -20,32 +20,26 @@ import numpy as np
import torch
from diffusers import StableDiffusionKDiffusionPipeline
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
nightly,
require_torch_accelerator,
torch_device,
)
from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device
enable_full_determinism()
@nightly
@require_torch_accelerator
@require_torch_gpu
class StableDiffusionPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def test_stable_diffusion_1(self):
sd_pipe = StableDiffusionKDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")

View File

@@ -28,13 +28,7 @@ from diffusers import (
StableDiffusionLDM3DPipeline,
UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
nightly,
require_torch_accelerator,
torch_device,
)
from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
@@ -211,17 +205,17 @@ class StableDiffusionLDM3DPipelineFastTests(unittest.TestCase):
@nightly
@require_torch_accelerator
@require_torch_gpu
class StableDiffusionLDM3DPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -262,17 +256,17 @@ class StableDiffusionLDM3DPipelineSlowTests(unittest.TestCase):
@nightly
@require_torch_accelerator
@require_torch_gpu
class StableDiffusionPipelineNightlyTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed)

View File

@@ -29,13 +29,7 @@ from diffusers import (
StableDiffusionSAGPipeline,
UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
nightly,
require_torch_accelerator,
torch_device,
)
from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import (
@@ -168,19 +162,19 @@ class StableDiffusionSAGPipelineFastTests(
@nightly
@require_torch_accelerator
@require_torch_gpu
class StableDiffusionPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def test_stable_diffusion_1(self):
sag_pipe = StableDiffusionSAGPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")

View File

@@ -13,17 +13,7 @@ from diffusers import (
UNet2DConditionModel,
)
from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism,
load_numpy,
nightly,
require_torch_accelerator,
torch_device,
)
from diffusers.utils.testing_utils import enable_full_determinism, load_numpy, nightly, require_torch_gpu, torch_device
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import (
@@ -200,19 +190,19 @@ class StableUnCLIPPipelineFastTests(
@nightly
@require_torch_accelerator
@require_torch_gpu
class StableUnCLIPPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def test_stable_unclip(self):
expected_image = load_numpy(
@@ -236,9 +226,9 @@ class StableUnCLIPPipelineIntegrationTests(unittest.TestCase):
assert_mean_pixel_difference(image, expected_image)
def test_stable_unclip_pipeline_with_sequential_cpu_offloading(self):
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
pipe = StableUnCLIPPipeline.from_pretrained("fusing/stable-unclip-2-1-l", torch_dtype=torch.float16)
pipe.set_progress_bar_config(disable=None)
@@ -252,6 +242,6 @@ class StableUnCLIPPipelineIntegrationTests(unittest.TestCase):
output_type="np",
)
mem_bytes = backend_max_memory_allocated(torch_device)
mem_bytes = torch.cuda.max_memory_allocated()
# make sure that less than 7 GB is allocated
assert mem_bytes < 7 * 10**9
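The 7 GB assertion follows a reset-run-measure pattern; a CUDA-only sketch of the same idea (the backend_* helpers shown above generalize it to other accelerators; the function below is illustrative, not part of the test suite):

    import torch

    def peak_cuda_memory_bytes(run) -> int:
        # Clear cached blocks and reset allocator statistics so the
        # high-water mark reflects only what `run` allocates.
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()
        run()
        return torch.cuda.max_memory_allocated()

    # e.g. assert peak_cuda_memory_bytes(lambda: pipe(prompt, output_type="np")) < 7 * 10**9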

View File

@@ -18,16 +18,12 @@ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism,
floats_tensor,
load_image,
load_numpy,
nightly,
require_torch_accelerator,
require_torch_gpu,
skip_mps,
torch_device,
)
@@ -217,19 +213,19 @@ class StableUnCLIPImg2ImgPipelineFastTests(
@nightly
@require_torch_accelerator
@require_torch_gpu
class StableUnCLIPImg2ImgPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def test_stable_unclip_l_img2img(self):
input_image = load_image(
@@ -290,9 +286,9 @@ class StableUnCLIPImg2ImgPipelineIntegrationTests(unittest.TestCase):
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/turtle.png"
)
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
"fusing/stable-unclip-2-1-h-img2img", torch_dtype=torch.float16
@@ -308,6 +304,6 @@ class StableUnCLIPImg2ImgPipelineIntegrationTests(unittest.TestCase):
output_type="np",
)
mem_bytes = backend_max_memory_allocated(torch_device)
mem_bytes = torch.cuda.max_memory_allocated()
# make sure that less than 7 GB is allocated
assert mem_bytes < 7 * 10**9

View File

@@ -87,24 +87,21 @@ class IsSafetensorsCompatibleTests(unittest.TestCase):
"unet/diffusion_pytorch_model.fp16.bin",
"unet/diffusion_pytorch_model.fp16.safetensors",
]
self.assertFalse(is_safetensors_compatible(filenames))
self.assertTrue(is_safetensors_compatible(filenames, variant="fp16"))
self.assertTrue(is_safetensors_compatible(filenames))
def test_diffusers_model_is_compatible_variant(self):
filenames = [
"unet/diffusion_pytorch_model.fp16.bin",
"unet/diffusion_pytorch_model.fp16.safetensors",
]
self.assertFalse(is_safetensors_compatible(filenames))
self.assertTrue(is_safetensors_compatible(filenames, variant="fp16"))
self.assertTrue(is_safetensors_compatible(filenames))
def test_diffusers_model_is_compatible_variant_mixed(self):
filenames = [
"unet/diffusion_pytorch_model.bin",
"unet/diffusion_pytorch_model.fp16.safetensors",
]
self.assertFalse(is_safetensors_compatible(filenames))
self.assertTrue(is_safetensors_compatible(filenames, variant="fp16"))
self.assertTrue(is_safetensors_compatible(filenames))
def test_diffusers_model_is_not_compatible_variant(self):
filenames = [
@@ -124,8 +121,7 @@ class IsSafetensorsCompatibleTests(unittest.TestCase):
"text_encoder/pytorch_model.fp16.bin",
"text_encoder/model.fp16.safetensors",
]
self.assertFalse(is_safetensors_compatible(filenames))
self.assertTrue(is_safetensors_compatible(filenames, variant="fp16"))
self.assertTrue(is_safetensors_compatible(filenames))
def test_transformer_model_is_not_compatible_variant(self):
filenames = [
@@ -149,8 +145,7 @@ class IsSafetensorsCompatibleTests(unittest.TestCase):
"unet/diffusion_pytorch_model.fp16.bin",
"unet/diffusion_pytorch_model.fp16.safetensors",
]
self.assertFalse(is_safetensors_compatible(filenames, folder_names={"vae", "unet"}))
self.assertTrue(is_safetensors_compatible(filenames, folder_names={"vae", "unet"}, variant="fp16"))
self.assertTrue(is_safetensors_compatible(filenames, folder_names={"vae", "unet"}))
def test_transformer_model_is_not_compatible_variant_extra_folder(self):
filenames = [
@@ -178,8 +173,7 @@ class IsSafetensorsCompatibleTests(unittest.TestCase):
"text_encoder/model.fp16-00001-of-00002.safetensors",
"text_encoder/model.fp16-00001-of-00002.safetensors",
]
self.assertFalse(is_safetensors_compatible(filenames))
self.assertTrue(is_safetensors_compatible(filenames, variant="fp16"))
self.assertTrue(is_safetensors_compatible(filenames))
def test_diffusers_is_compatible_sharded(self):
filenames = [
@@ -195,15 +189,13 @@ class IsSafetensorsCompatibleTests(unittest.TestCase):
"unet/diffusion_pytorch_model.fp16-00001-of-00002.safetensors",
"unet/diffusion_pytorch_model.fp16-00001-of-00002.safetensors",
]
self.assertFalse(is_safetensors_compatible(filenames))
self.assertTrue(is_safetensors_compatible(filenames, variant="fp16"))
self.assertTrue(is_safetensors_compatible(filenames))
def test_diffusers_is_compatible_only_variants(self):
filenames = [
"unet/diffusion_pytorch_model.fp16.safetensors",
]
self.assertFalse(is_safetensors_compatible(filenames))
self.assertTrue(is_safetensors_compatible(filenames, variant="fp16"))
self.assertTrue(is_safetensors_compatible(filenames))
def test_diffusers_is_compatible_no_components(self):
filenames = [
@@ -217,20 +209,6 @@ class IsSafetensorsCompatibleTests(unittest.TestCase):
]
self.assertFalse(is_safetensors_compatible(filenames))
def test_is_compatible_mixed_variants(self):
filenames = [
"unet/diffusion_pytorch_model.fp16.safetensors",
"vae/diffusion_pytorch_model.safetensors",
]
self.assertTrue(is_safetensors_compatible(filenames, variant="fp16"))
def test_is_compatible_variant_and_non_safetensors(self):
filenames = [
"unet/diffusion_pytorch_model.fp16.safetensors",
"vae/diffusion_pytorch_model.bin",
]
self.assertFalse(is_safetensors_compatible(filenames, variant="fp16"))
class VariantCompatibleSiblingsTest(unittest.TestCase):
def test_only_non_variants_downloaded(self):

View File

@@ -588,17 +588,20 @@ class DownloadTests(unittest.TestCase):
logger = logging.get_logger("diffusers.pipelines.pipeline_utils")
deprecated_warning_msg = "Warning: The repository contains sharded checkpoints for variant"
with CaptureLogger(logger) as cap_logger:
with tempfile.TemporaryDirectory() as tmpdirname:
local_repo_id = snapshot_download(repo_id, cache_dir=tmpdirname)
for is_local in [True, False]:
with CaptureLogger(logger) as cap_logger:
with tempfile.TemporaryDirectory() as tmpdirname:
local_repo_id = repo_id
if is_local:
local_repo_id = snapshot_download(repo_id, cache_dir=tmpdirname)
_ = DiffusionPipeline.from_pretrained(
local_repo_id,
safety_checker=None,
variant="fp16",
use_safetensors=True,
)
assert deprecated_warning_msg in str(cap_logger), "Deprecation warning not found in logs"
_ = DiffusionPipeline.from_pretrained(
local_repo_id,
safety_checker=None,
variant="fp16",
use_safetensors=True,
)
assert deprecated_warning_msg in str(cap_logger), "Deprecation warning not found in logs"
def test_download_safetensors_only_variant_exists_for_model(self):
variant = None
@@ -613,7 +616,7 @@ class DownloadTests(unittest.TestCase):
variant=variant,
use_safetensors=use_safetensors,
)
assert "Could not find the necessary `safetensors` weights" in str(error_context.exception)
assert "Error no file name" in str(error_context.exception)
# text encoder has fp16 variants so we can load it
with tempfile.TemporaryDirectory() as tmpdirname:
@@ -672,7 +675,7 @@ class DownloadTests(unittest.TestCase):
use_safetensors=use_safetensors,
)
assert "Could not find the necessary `safetensors` weights" in str(error_context.exception)
assert "Error no file name" in str(error_context.exception)
def test_download_bin_variant_does_not_exist_for_model(self):
variant = "no_ema"
@@ -1994,9 +1997,7 @@ class PipelineSlowTests(unittest.TestCase):
reason="Torch Dynamo isn't yet supported for Python 3.12.",
)
def test_from_save_pretrained_dynamo(self):
torch.compiler.reset()
with torch._inductor.utils.fresh_inductor_cache():
run_test_in_subprocess(test_case=self, target_func=_test_from_save_pretrained_dynamo, inputs=None)
run_test_in_subprocess(test_case=self, target_func=_test_from_save_pretrained_dynamo, inputs=None)
def test_from_pretrained_hub(self):
model_path = "google/ddpm-cifar10-32"
@@ -2208,7 +2209,7 @@ class TestLoraHotSwappingForPipeline(unittest.TestCase):
# It is critical that the dynamo cache is reset for each test. Otherwise, if the test re-uses the same model,
# there will be recompilation errors, as torch caches the model when run in the same process.
super().tearDown()
torch.compiler.reset()
torch._dynamo.reset()
gc.collect()
backend_empty_cache(torch_device)
@@ -2333,21 +2334,21 @@ class TestLoraHotSwappingForPipeline(unittest.TestCase):
def test_hotswapping_compiled_pipline_linear(self, rank0, rank1):
# It's important to add this context to raise an error on recompilation
target_modules = ["to_q", "to_k", "to_v", "to_out.0"]
with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
with torch._dynamo.config.patch(error_on_recompile=True):
self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
@parameterized.expand([(11, 11), (7, 13), (13, 7)]) # important to test small to large and vice versa
def test_hotswapping_compiled_pipline_conv2d(self, rank0, rank1):
# It's important to add this context to raise an error on recompilation
target_modules = ["conv", "conv1", "conv2"]
with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
with torch._dynamo.config.patch(error_on_recompile=True):
self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
@parameterized.expand([(11, 11), (7, 13), (13, 7)]) # important to test small to large and vice versa
def test_hotswapping_compiled_pipline_both_linear_and_conv2d(self, rank0, rank1):
# It's important to add this context to raise an error on recompilation
target_modules = ["to_q", "conv"]
with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
with torch._dynamo.config.patch(error_on_recompile=True):
self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
def test_enable_lora_hotswap_called_after_adapter_added_raises(self):
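One side of these hunks wraps the compiled run in both error_on_recompile and fresh_inductor_cache; a short sketch of what that combination enforces (illustrative wrapper, not part of the test suite):

    import torch
    from torch._inductor.utils import fresh_inductor_cache

    def compile_and_check_no_recompile(module, *inputs):
        # fresh_inductor_cache isolates compile artifacts per test, and
        # error_on_recompile turns any unexpected re-trace (e.g. after a LoRA
        # hot-swap with a different rank) into a hard failure.
        with torch._dynamo.config.patch(error_on_recompile=True), fresh_inductor_cache():
            compiled = torch.compile(module)
            compiled(*inputs)         # first call traces and compiles
            return compiled(*inputs)  # must reuse the cached graph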

View File

@@ -1111,14 +1111,14 @@ class PipelineTesterMixin:
def setUp(self):
# clean up the VRAM before each test
super().setUp()
torch.compiler.reset()
torch._dynamo.reset()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test in case of CUDA runtime errors
super().tearDown()
torch.compiler.reset()
torch._dynamo.reset()
gc.collect()
backend_empty_cache(torch_device)
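Both reset calls shown in this hunk clear torch.compile state between tests; a version-tolerant helper might look like the following (hypothetical, for illustration):

    import torch

    def reset_torch_compile_state() -> None:
        if hasattr(torch, "compiler") and hasattr(torch.compiler, "reset"):
            torch.compiler.reset()   # public API in newer torch releases
        else:
            torch._dynamo.reset()    # private fallback on older releases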

View File

@@ -19,44 +19,37 @@ import unittest
import torch
from diffusers import DDIMScheduler, TextToVideoZeroPipeline
from diffusers.utils.testing_utils import (
backend_empty_cache,
load_pt,
nightly,
require_torch_accelerator,
torch_device,
)
from diffusers.utils.testing_utils import load_pt, nightly, require_torch_gpu
from ..test_pipelines_common import assert_mean_pixel_difference
@nightly
@require_torch_accelerator
@require_torch_gpu
class TextToVideoZeroPipelineSlowTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def test_full_model(self):
model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device)
pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
generator = torch.Generator(device="cpu").manual_seed(0)
generator = torch.Generator(device="cuda").manual_seed(0)
prompt = "A bear is playing a guitar on Times Square"
result = pipe(prompt=prompt, generator=generator).images
expected_result = load_pt(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text-to-video/A bear is playing a guitar on Times Square.pt",
weights_only=False,
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text-to-video/A bear is playing a guitar on Times Square.pt"
)
assert_mean_pixel_difference(result, expected_result)
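One side of this hunk passes weights_only=False to load_pt; newer torch releases default torch.load to weights-only deserialization, which rejects arbitrary pickled objects, so the flag matters for .pt artifacts that are not plain tensors or state dicts. A rough sketch of what a load_pt-style helper does (the helper body and use of requests are assumptions, not the diffusers code):

    import io

    import requests
    import torch

    def load_pt(url: str, weights_only: bool = True):
        # Only disable weights_only for artifacts you trust, such as these
        # hf-internal-testing reference tensors.
        response = requests.get(url)
        response.raise_for_status()
        return torch.load(io.BytesIO(response.content), weights_only=weights_only)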

View File

@@ -24,11 +24,11 @@ from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProject
from diffusers import AutoencoderKL, DDIMScheduler, TextToVideoZeroSDXLPipeline, UNet2DConditionModel
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
nightly,
require_accelerate_version_greater,
require_torch_accelerator,
require_accelerator,
require_torch_gpu,
torch_device,
)
@@ -220,7 +220,7 @@ class TextToVideoZeroSDXLPipelineFastTests(PipelineTesterMixin, PipelineFromPipe
self.assertLess(max_diff, expected_max_difference)
@unittest.skipIf(torch_device not in ["cuda", "xpu"], reason="float16 requires CUDA or XPU")
@require_torch_accelerator
@require_accelerator
def test_float16_inference(self, expected_max_diff=5e-2):
components = self.get_dummy_components()
for name, module in components.items():
@@ -262,7 +262,7 @@ class TextToVideoZeroSDXLPipelineFastTests(PipelineTesterMixin, PipelineFromPipe
def test_inference_batch_single_identical(self):
pass
@require_torch_accelerator
@require_accelerator
@require_accelerate_version_greater("0.17.0")
def test_model_cpu_offload_forward_pass(self, expected_max_diff=2e-4):
components = self.get_dummy_components()
@@ -285,7 +285,7 @@ class TextToVideoZeroSDXLPipelineFastTests(PipelineTesterMixin, PipelineFromPipe
pass
@unittest.skipIf(torch_device not in ["cuda", "xpu"], reason="float16 requires CUDA or XPU")
@require_torch_accelerator
@require_accelerator
def test_save_load_float16(self, expected_max_diff=1e-2):
components = self.get_dummy_components()
for name, module in components.items():
@@ -337,7 +337,7 @@ class TextToVideoZeroSDXLPipelineFastTests(PipelineTesterMixin, PipelineFromPipe
def test_sequential_cpu_offload_forward_pass(self):
pass
@require_torch_accelerator
@require_accelerator
def test_to_device(self):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
@@ -365,19 +365,19 @@ class TextToVideoZeroSDXLPipelineFastTests(PipelineTesterMixin, PipelineFromPipe
@nightly
@require_torch_accelerator
@require_torch_gpu
class TextToVideoZeroSDXLPipelineSlowTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def test_full_model(self):
model_id = "stabilityai/stable-diffusion-xl-base-1.0"

View File

@@ -23,14 +23,10 @@ from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokeni
from diffusers import PriorTransformer, UnCLIPPipeline, UnCLIPScheduler, UNet2DConditionModel, UNet2DModel
from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel
from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism,
load_numpy,
nightly,
require_torch_accelerator,
require_torch_gpu,
skip_mps,
torch_device,
)
@@ -430,13 +426,13 @@ class UnCLIPPipelineCPUIntegrationTests(unittest.TestCase):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def test_unclip_karlo_cpu_fp32(self):
expected_image = load_numpy(
@@ -462,19 +458,19 @@ class UnCLIPPipelineCPUIntegrationTests(unittest.TestCase):
@nightly
@require_torch_accelerator
@require_torch_gpu
class UnCLIPPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def test_unclip_karlo(self):
expected_image = load_numpy(
@@ -500,9 +496,9 @@ class UnCLIPPipelineIntegrationTests(unittest.TestCase):
assert_mean_pixel_difference(image, expected_image)
def test_unclip_pipeline_with_sequential_cpu_offloading(self):
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
pipe = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha", torch_dtype=torch.float16)
pipe.set_progress_bar_config(disable=None)
@@ -518,6 +514,6 @@ class UnCLIPPipelineIntegrationTests(unittest.TestCase):
output_type="np",
)
mem_bytes = backend_max_memory_allocated(torch_device)
mem_bytes = torch.cuda.max_memory_allocated()
# make sure that less than 7 GB is allocated
assert mem_bytes < 7 * 10**9

View File

@@ -37,13 +37,12 @@ from diffusers import (
)
from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
load_image,
load_numpy,
nightly,
require_torch_accelerator,
require_torch_gpu,
skip_mps,
torch_device,
)
@@ -497,19 +496,19 @@ class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCa
@nightly
@require_torch_accelerator
@require_torch_gpu
class UnCLIPImageVariationPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
torch.cuda.empty_cache()
def test_unclip_image_variation_karlo(self):
input_image = load_image(

View File

@@ -1,5 +1,6 @@
import gc
import random
import traceback
import unittest
import numpy as np
@@ -26,7 +27,9 @@ from diffusers.utils.testing_utils import (
floats_tensor,
load_image,
nightly,
require_torch_2,
require_torch_accelerator,
run_test_in_subprocess,
torch_device,
)
from diffusers.utils.torch_utils import randn_tensor
@@ -42,6 +45,38 @@ from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, Pipeline
enable_full_determinism()
# Will be run via run_test_in_subprocess
def _test_unidiffuser_compile(in_queue, out_queue, timeout):
error = None
try:
inputs = in_queue.get(timeout=timeout)
torch_device = inputs.pop("torch_device")
seed = inputs.pop("seed")
inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1")
# pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to(torch_device)
pipe.unet.to(memory_format=torch.channels_last)
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
pipe.set_progress_bar_config(disable=None)
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.2402, 0.2375, 0.2285, 0.2378, 0.2407, 0.2263, 0.2354, 0.2307, 0.2520])
assert np.abs(image_slice - expected_slice).max() < 1e-1
except Exception:
error = f"{traceback.format_exc()}"
results = {"error": error}
out_queue.put(results, timeout=timeout)
out_queue.join()
class UniDiffuserPipelineFastTests(
PipelineTesterMixin, PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase
):
@@ -655,6 +690,19 @@ class UniDiffuserPipelineSlowTests(unittest.TestCase):
expected_text_prefix = "An astronaut"
assert text[0][: len(expected_text_prefix)] == expected_text_prefix
@unittest.skip(reason="Skip torch.compile test to speed up the slow test suite.")
@require_torch_2
def test_unidiffuser_compile(self, seed=0):
inputs = self.get_inputs(torch_device, seed=seed, generate_latents=True)
# Delete prompt and image for joint inference.
del inputs["prompt"]
del inputs["image"]
# Can't pickle a Generator object
del inputs["generator"]
inputs["torch_device"] = torch_device
inputs["seed"] = seed
run_test_in_subprocess(test_case=self, target_func=_test_unidiffuser_compile, inputs=inputs)
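run_test_in_subprocess hands the inputs to the child over a queue and reads back an error string; a self-contained sketch of that protocol (names here are illustrative, not the diffusers helper):

    import multiprocessing
    import traceback

    def _child(in_queue, out_queue, timeout):
        error = None
        try:
            payload = in_queue.get(timeout=timeout)
            assert payload["seed"] == 0  # stand-in for running the compiled pipeline
        except Exception:
            error = traceback.format_exc()
        out_queue.put({"error": error}, timeout=timeout)
        out_queue.join()

    def run_in_subprocess(target, inputs, timeout=600):
        ctx = multiprocessing.get_context("spawn")  # fresh CUDA context for the child
        in_queue, out_queue = ctx.Queue(), ctx.JoinableQueue()
        in_queue.put(inputs, timeout=timeout)
        process = ctx.Process(target=target, args=(in_queue, out_queue, timeout))
        process.start()
        results = out_queue.get(timeout=timeout)
        out_queue.task_done()
        process.join(timeout=timeout)
        if results["error"] is not None:
            raise AssertionError(results["error"])

    if __name__ == "__main__":
        run_in_subprocess(_child, {"seed": 0})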
@nightly
@require_torch_accelerator