mirror of https://github.com/huggingface/diffusers.git
synced 2025-12-06 20:44:33 +08:00

Compare commits: attn-refac ... revert-115

1 commit

| Author | SHA1 | Date |
|---|---|---|
|  | 2a7bb53391 |  |
.github/workflows/benchmark.yml
vendored
2
.github/workflows/benchmark.yml
vendored
@@ -23,7 +23,7 @@ jobs:
|
||||
runs-on:
|
||||
group: aws-g6-4xlarge-plus
|
||||
container:
|
||||
image: diffusers/diffusers-pytorch-cuda
|
||||
image: diffusers/diffusers-pytorch-compile-cuda
|
||||
options: --shm-size "16gb" --ipc host --gpus 0
|
||||
steps:
|
||||
- name: Checkout diffusers
|
||||
|
||||
8 .github/workflows/build_docker_images.yml vendored

@@ -41,12 +41,6 @@ jobs:
         run: |
           CHANGED_FILES="${{ steps.file_changes.outputs.all }}"
           for FILE in $CHANGED_FILES; do
-            # skip anything that isn’t still on disk
-            if [[ ! -f "$FILE" ]]; then
-              echo "Skipping removed file $FILE"
-              continue
-            fi
-
             if [[ "$FILE" == docker/*Dockerfile ]]; then
               DOCKER_PATH="${FILE%/Dockerfile}"
               DOCKER_TAG=$(basename "$DOCKER_PATH")
@@ -71,7 +65,7 @@ jobs:
         image-name:
           - diffusers-pytorch-cpu
           - diffusers-pytorch-cuda
+          - diffusers-pytorch-compile-cuda
           - diffusers-pytorch-xformers-cuda
           - diffusers-pytorch-minimum-cuda
           - diffusers-flax-cpu
2 .github/workflows/nightly_tests.yml vendored

@@ -188,7 +188,7 @@ jobs:
       group: aws-g4dn-2xlarge

     container:
-      image: diffusers/diffusers-pytorch-cuda
+      image: diffusers/diffusers-pytorch-compile-cuda
       options: --gpus 0 --shm-size "16gb" --ipc host

     steps:

2 .github/workflows/push_tests.yml vendored

@@ -262,7 +262,7 @@ jobs:
       group: aws-g4dn-2xlarge

     container:
-      image: diffusers/diffusers-pytorch-cuda
+      image: diffusers/diffusers-pytorch-compile-cuda
       options: --gpus 0 --shm-size "16gb" --ipc host

     steps:

2 .github/workflows/release_tests_fast.yml vendored

@@ -316,7 +316,7 @@ jobs:
       group: aws-g4dn-2xlarge

     container:
-      image: diffusers/diffusers-pytorch-cuda
+      image: diffusers/diffusers-pytorch-compile-cuda
       options: --gpus 0 --shm-size "16gb" --ipc host

     steps:
50 docker/diffusers-pytorch-compile-cuda/Dockerfile Normal file

@@ -0,0 +1,50 @@
+FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04
+LABEL maintainer="Hugging Face"
+LABEL repository="diffusers"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get -y update \
+    && apt-get install -y software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa
+
+RUN apt install -y bash \
+    build-essential \
+    git \
+    git-lfs \
+    curl \
+    ca-certificates \
+    libsndfile1-dev \
+    libgl1 \
+    python3.10 \
+    python3.10-dev \
+    python3-pip \
+    python3.10-venv && \
+    rm -rf /var/lib/apt/lists
+
+# make sure to use venv
+RUN python3.10 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
+RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
+    python3.10 -m uv pip install --no-cache-dir \
+    torch \
+    torchvision \
+    torchaudio \
+    invisible_watermark && \
+    python3.10 -m pip install --no-cache-dir \
+    accelerate \
+    datasets \
+    hf-doc-builder \
+    huggingface-hub \
+    hf_transfer \
+    Jinja2 \
+    librosa \
+    numpy==1.26.4 \
+    scipy \
+    tensorboard \
+    transformers \
+    hf_transfer
+
+CMD ["/bin/bash"]
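The image pre-installs the heavy PyTorch stack into an `/opt/venv` virtualenv that is put first on `PATH`, and pins `numpy==1.26.4`. As a quick way to confirm that a container built from it exposes that environment, here is a minimal sanity-check sketch; running it inside the container, and the exact set of packages probed, are assumptions for illustration, not part of the commit:

```python
# Minimal environment sanity check, assuming it is run inside a container
# built from the Dockerfile above (where /opt/venv is first on PATH).
import importlib.metadata as metadata

import numpy

# a few of the heavy deps the image pre-installs; setup.py may override them later
for pkg in ("torch", "torchvision", "torchaudio", "accelerate", "transformers"):
    print(pkg, metadata.version(pkg))

# the image pins numpy explicitly
assert numpy.__version__ == "1.26.4"
```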
@@ -1748,14 +1748,14 @@ class TorchCompileTesterMixin:
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
-        torch.compiler.reset()
+        torch._dynamo.reset()
         gc.collect()
         backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test in case of CUDA runtime errors
         super().tearDown()
-        torch.compiler.reset()
+        torch._dynamo.reset()
         gc.collect()
         backend_empty_cache(torch_device)

@@ -1764,17 +1764,13 @@ class TorchCompileTesterMixin:
     @is_torch_compile
     @slow
     def test_torch_compile_recompilation_and_graph_break(self):
-        torch.compiler.reset()
+        torch._dynamo.reset()
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()

         model = self.model_class(**init_dict).to(torch_device)
         model = torch.compile(model, fullgraph=True)

-        with (
-            torch._inductor.utils.fresh_inductor_cache(),
-            torch._dynamo.config.patch(error_on_recompile=True),
-            torch.no_grad(),
-        ):
+        with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
             _ = model(**inputs_dict)
             _ = model(**inputs_dict)
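The `torch._dynamo.config.patch(error_on_recompile=True)` context is what makes this test meaningful: it turns a silent recompilation into a hard failure, so calling the compiled model twice on identical inputs proves the graph is stable. A minimal sketch of the same pattern, assuming torch >= 2.0, with a toy linear layer standing in for the diffusers model:

```python
# Sketch of the recompilation guard used above (assumption: torch >= 2.0).
import torch

model = torch.compile(torch.nn.Linear(8, 8), fullgraph=True)
x = torch.randn(2, 8)

with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
    _ = model(x)  # first call triggers compilation
    _ = model(x)  # second call must reuse the cached graph or raise
```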
@@ -1802,7 +1798,7 @@ class LoraHotSwappingForModelTesterMixin:
         # It is critical that the dynamo cache is reset for each test. Otherwise, if the test re-uses the same model,
         # there will be recompilation errors, as torch caches the model when run in the same process.
         super().tearDown()
-        torch.compiler.reset()
+        torch._dynamo.reset()
         gc.collect()
         backend_empty_cache(torch_device)

@@ -1919,7 +1915,7 @@ class LoraHotSwappingForModelTesterMixin:
     def test_hotswapping_compiled_model_linear(self, rank0, rank1):
         # It's important to add this context to raise an error on recompilation
         target_modules = ["to_q", "to_k", "to_v", "to_out.0"]
-        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
+        with torch._dynamo.config.patch(error_on_recompile=True):
             self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)

     @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
@@ -1929,7 +1925,7 @@ class LoraHotSwappingForModelTesterMixin:

         # It's important to add this context to raise an error on recompilation
         target_modules = ["conv", "conv1", "conv2"]
-        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
+        with torch._dynamo.config.patch(error_on_recompile=True):
             self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)

     @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
@@ -1939,7 +1935,7 @@ class LoraHotSwappingForModelTesterMixin:

         # It's important to add this context to raise an error on recompilation
         target_modules = ["to_q", "conv"]
-        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
+        with torch._dynamo.config.patch(error_on_recompile=True):
             self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)

     @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
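Dropping `fresh_inductor_cache()` while keeping `error_on_recompile=True` preserves the property hot-swapping actually depends on: swapping weights in place keeps tensor identities, shapes, and dtypes stable, so the compiled graph stays valid. An illustrative sketch of that idea with a plain linear layer (an assumption-level toy, not the diffusers/PEFT hot-swap implementation):

```python
# In-place weight replacement does not invalidate a compiled graph, because
# Dynamo guards on tensor metadata (shape/dtype/device), not parameter values.
import torch

layer = torch.compile(torch.nn.Linear(4, 4))
x = torch.randn(1, 4)

with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
    _ = layer(x)                           # compile once
    layer.weight.copy_(torch.randn(4, 4))  # "hot-swap" the weights in place
    _ = layer(x)                           # no recompilation is triggered
```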
@@ -19,16 +19,20 @@ import torch
 from diffusers import HunyuanVideoTransformer3DModel
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
+    is_torch_compile,
+    require_torch_2,
+    require_torch_gpu,
+    slow,
     torch_device,
 )

-from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin
+from ..test_modeling_common import ModelTesterMixin


 enable_full_determinism()


-class HunyuanVideoTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
+class HunyuanVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
     model_class = HunyuanVideoTransformer3DModel
     main_input_name = "hidden_states"
     uses_custom_attn_processor = True
@@ -92,8 +96,23 @@ class HunyuanVideoTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin,
         expected_set = {"HunyuanVideoTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)

+    @require_torch_gpu
+    @require_torch_2
+    @is_torch_compile
+    @slow
+    def test_torch_compile_recompilation_and_graph_break(self):
+        torch._dynamo.reset()
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict).to(torch_device)
+        model = torch.compile(model, fullgraph=True)
+
+        with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
+            _ = model(**inputs_dict)
+            _ = model(**inputs_dict)
+

-class HunyuanSkyreelsImageToVideoTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
+class HunyuanSkyreelsImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
     model_class = HunyuanVideoTransformer3DModel
     main_input_name = "hidden_states"
     uses_custom_attn_processor = True
@@ -160,8 +179,23 @@ class HunyuanSkyreelsImageToVideoTransformer3DTests(ModelTesterMixin, TorchCompi
         expected_set = {"HunyuanVideoTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)

+    @require_torch_gpu
+    @require_torch_2
+    @is_torch_compile
+    @slow
+    def test_torch_compile_recompilation_and_graph_break(self):
+        torch._dynamo.reset()
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict).to(torch_device)
+        model = torch.compile(model, fullgraph=True)
+
+        with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
+            _ = model(**inputs_dict)
+            _ = model(**inputs_dict)
+

-class HunyuanVideoImageToVideoTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
+class HunyuanVideoImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
     model_class = HunyuanVideoTransformer3DModel
     main_input_name = "hidden_states"
     uses_custom_attn_processor = True
@@ -226,10 +260,23 @@ class HunyuanVideoImageToVideoTransformer3DTests(ModelTesterMixin, TorchCompileT
         expected_set = {"HunyuanVideoTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)

+    @require_torch_gpu
+    @require_torch_2
+    @is_torch_compile
+    @slow
+    def test_torch_compile_recompilation_and_graph_break(self):
+        torch._dynamo.reset()
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict).to(torch_device)
+        model = torch.compile(model, fullgraph=True)
+
+        with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
+            _ = model(**inputs_dict)
+            _ = model(**inputs_dict)
+

-class HunyuanVideoTokenReplaceImageToVideoTransformer3DTests(
-    ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase
-):
+class HunyuanVideoTokenReplaceImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase):
     model_class = HunyuanVideoTransformer3DModel
     main_input_name = "hidden_states"
     uses_custom_attn_processor = True
@@ -295,3 +342,18 @@ class HunyuanVideoTokenReplaceImageToVideoTransformer3DTests(
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"HunyuanVideoTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+    @require_torch_gpu
+    @require_torch_2
+    @is_torch_compile
+    @slow
+    def test_torch_compile_recompilation_and_graph_break(self):
+        torch._dynamo.reset()
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict).to(torch_device)
+        model = torch.compile(model, fullgraph=True)
+
+        with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
+            _ = model(**inputs_dict)
+            _ = model(**inputs_dict)

@@ -19,16 +19,20 @@ import torch
 from diffusers import WanTransformer3DModel
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
+    is_torch_compile,
+    require_torch_2,
+    require_torch_gpu,
+    slow,
     torch_device,
 )

-from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin
+from ..test_modeling_common import ModelTesterMixin


 enable_full_determinism()


-class WanTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
+class WanTransformer3DTests(ModelTesterMixin, unittest.TestCase):
     model_class = WanTransformer3DModel
     main_input_name = "hidden_states"
     uses_custom_attn_processor = True
@@ -82,3 +86,18 @@ class WanTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.
     def test_gradient_checkpointing_is_applied(self):
         expected_set = {"WanTransformer3DModel"}
         super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+    @require_torch_gpu
+    @require_torch_2
+    @is_torch_compile
+    @slow
+    def test_torch_compile_recompilation_and_graph_break(self):
+        torch._dynamo.reset()
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        model = self.model_class(**init_dict).to(torch_device)
+        model = torch.compile(model, fullgraph=True)
+
+        with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
+            _ = model(**inputs_dict)
+            _ = model(**inputs_dict)
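Every per-class test above compiles with `fullgraph=True`, which turns any graph break into an error instead of a silent split into subgraphs. A small sketch of what the flag enforces, assuming torch >= 2.0:

```python
# fullgraph=True makes Dynamo raise on constructs it cannot trace, such as
# an arbitrary Python side effect inside forward().
import torch


class Chatty(torch.nn.Module):
    def forward(self, x):
        print("side effect")  # not traceable: forces a graph break
        return x * 2


compiled = torch.compile(Chatty(), fullgraph=True)
try:
    compiled(torch.randn(2))
except Exception as exc:  # torch._dynamo.exc.Unsupported in practice
    print(type(exc).__name__)
```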
@@ -15,6 +15,7 @@

 import gc
 import tempfile
+import traceback
 import unittest

 import numpy as np
@@ -38,9 +39,13 @@ from diffusers.utils.testing_utils import (
     backend_reset_max_memory_allocated,
     backend_reset_peak_memory_stats,
     enable_full_determinism,
     get_python_version,
+    is_torch_compile,
     load_image,
+    load_numpy,
+    require_torch_2,
     require_torch_accelerator,
+    run_test_in_subprocess,
     slow,
     torch_device,
 )
@@ -63,6 +68,52 @@ from ..test_pipelines_common import (
 enable_full_determinism()


+# Will be run via run_test_in_subprocess
+def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
+    error = None
+    try:
+        _ = in_queue.get(timeout=timeout)
+
+        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
+
+        pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.unet.to(memory_format=torch.channels_last)
+        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+        pipe.controlnet.to(memory_format=torch.channels_last)
+        pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "bird"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        ).resize((512, 512))
+
+        output = pipe(prompt, image, num_inference_steps=10, generator=generator, output_type="np")
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy"
+        )
+        expected_image = np.resize(expected_image, (512, 512, 3))
+
+        assert np.abs(expected_image - image).max() < 1.0
+
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
 class ControlNetPipelineFastTests(
     IPAdapterTesterMixin,
     PipelineLatentTesterMixin,
@@ -1002,6 +1053,15 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
         expected_slice = np.array([0.1655, 0.1721, 0.1623, 0.1685, 0.1711, 0.1646, 0.1651, 0.1631, 0.1494])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

+    @is_torch_compile
+    @require_torch_2
+    @unittest.skipIf(
+        get_python_version == (3, 12),
+        reason="Torch Dynamo isn't yet supported for Python 3.12.",
+    )
+    def test_stable_diffusion_compile(self):
+        run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=None)
+
     def test_v11_shuffle_global_pool_conditions(self):
         controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11e_sd15_shuffle")
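`run_test_in_subprocess` (from `diffusers.utils.testing_utils`) executes the `_test_*_compile` helpers in a fresh process, so Dynamo and Inductor state cannot leak between tests; inputs go in through one queue and errors come back through another. A self-contained sketch of that queue contract (the spawn-context details and the `_target` body are assumptions for illustration, not the library's exact implementation):

```python
# Sketch of the in_queue/out_queue contract the _test_* helpers follow.
import multiprocessing
import traceback


def _target(in_queue, out_queue, timeout):
    error = None
    try:
        inputs = in_queue.get(timeout=timeout)
        assert inputs["seed"] == 0  # the real test body would run here
    except Exception:
        error = f"{traceback.format_exc()}"
    out_queue.put({"error": error}, timeout=timeout)
    out_queue.join()  # wait until the parent has consumed the result


if __name__ == "__main__":
    ctx = multiprocessing.get_context("spawn")
    in_q, out_q = ctx.Queue(), ctx.JoinableQueue()
    proc = ctx.Process(target=_target, args=(in_q, out_q, 30))
    proc.start()
    in_q.put({"seed": 0}, timeout=30)
    results = out_q.get(timeout=30)
    out_q.task_done()
    proc.join(timeout=30)
    assert results["error"] is None, results["error"]
```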
@@ -14,6 +14,7 @@
 # limitations under the License.

 import gc
+import traceback
 import unittest

 import numpy as np
@@ -35,9 +36,13 @@ from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import (
     backend_empty_cache,
     enable_full_determinism,
+    is_torch_compile,
     load_image,
+    load_numpy,
     require_accelerator,
+    require_torch_2,
     require_torch_accelerator,
+    run_test_in_subprocess,
     slow,
     torch_device,
 )
@@ -73,6 +78,53 @@ def to_np(tensor):
     return tensor


+# Will be run via run_test_in_subprocess
+def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
+    error = None
+    try:
+        _ = in_queue.get(timeout=timeout)
+
+        controlnet = ControlNetXSAdapter.from_pretrained(
+            "UmerHA/Testing-ConrolNetXS-SD2.1-canny", torch_dtype=torch.float16
+        )
+        pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-2-1-base",
+            controlnet=controlnet,
+            safety_checker=None,
+            torch_dtype=torch.float16,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.unet.to(memory_format=torch.channels_last)
+        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+        generator = torch.Generator(device="cpu").manual_seed(0)
+        prompt = "bird"
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+        ).resize((512, 512))
+
+        output = pipe(prompt, image, num_inference_steps=10, generator=generator, output_type="np")
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy"
+        )
+        expected_image = np.resize(expected_image, (512, 512, 3))
+
+        assert np.abs(expected_image - image).max() < 1.0
+
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
 class ControlNetXSPipelineFastTests(
     PipelineLatentTesterMixin,
     PipelineKarrasSchedulerTesterMixin,
@@ -350,3 +402,8 @@ class ControlNetXSPipelineSlowTests(unittest.TestCase):
         original_image = image[-3:, -3:, -1].flatten()
         expected_image = np.array([0.4844, 0.4937, 0.4956, 0.4663, 0.5039, 0.5044, 0.4565, 0.4883, 0.4941])
         assert np.allclose(original_image, expected_image, atol=1e-04)
+
+    @is_torch_compile
+    @require_torch_2
+    def test_stable_diffusion_compile(self):
+        run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=None)
@@ -17,6 +17,7 @@
 import gc
 import tempfile
 import time
+import traceback
 import unittest

 import numpy as np
@@ -48,12 +49,16 @@ from diffusers.utils.testing_utils import (
     backend_reset_max_memory_allocated,
     backend_reset_peak_memory_stats,
     enable_full_determinism,
+    is_torch_compile,
     load_image,
+    load_numpy,
     nightly,
     numpy_cosine_similarity_distance,
     require_accelerate_version_greater,
+    require_torch_2,
     require_torch_accelerator,
     require_torch_multi_accelerator,
+    run_test_in_subprocess,
     skip_mps,
     slow,
     torch_device,
@@ -76,6 +81,39 @@ from ..test_pipelines_common import (
 enable_full_determinism()


+# Will be run via run_test_in_subprocess
+def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
+    error = None
+    try:
+        inputs = in_queue.get(timeout=timeout)
+        torch_device = inputs.pop("torch_device")
+        seed = inputs.pop("seed")
+        inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
+
+        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
+        sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+
+        sd_pipe.unet.to(memory_format=torch.channels_last)
+        sd_pipe.unet = torch.compile(sd_pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239])
+
+        assert np.abs(image_slice - expected_slice).max() < 5e-3
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
 class StableDiffusionPipelineFastTests(
     IPAdapterTesterMixin,
     PipelineLatentTesterMixin,
@@ -1186,6 +1224,40 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
         max_diff = np.abs(expected_image - image).max()
         assert max_diff < 8e-1

+    @is_torch_compile
+    @require_torch_2
+    def test_stable_diffusion_compile(self):
+        seed = 0
+        inputs = self.get_inputs(torch_device, seed=seed)
+        # Can't pickle a Generator object
+        del inputs["generator"]
+        inputs["torch_device"] = torch_device
+        inputs["seed"] = seed
+        run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=inputs)
+
+    def test_stable_diffusion_lcm(self):
+        unet = UNet2DConditionModel.from_pretrained("SimianLuo/LCM_Dreamshaper_v7", subfolder="unet")
+        sd_pipe = StableDiffusionPipeline.from_pretrained("Lykon/dreamshaper-7", unet=unet).to(torch_device)
+        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_inputs(torch_device)
+        inputs["num_inference_steps"] = 6
+        inputs["output_type"] = "pil"
+
+        image = sd_pipe(**inputs).images[0]
+
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/lcm_full/stable_diffusion_lcm.png"
+        )
+
+        image = sd_pipe.image_processor.pil_to_numpy(image)
+        expected_image = sd_pipe.image_processor.pil_to_numpy(expected_image)
+
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
+
+        assert max_diff < 1e-2
+

 @slow
 @require_torch_accelerator
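All of these subprocess tests delete the `generator` entry from `inputs` before handing them to the child process and rebuild the generator from a plain integer seed on the other side. The reason, shown in a minimal sketch, is that `torch.Generator` holds C++ state that the pickle-based queues cannot serialize:

```python
# "Can't pickle a Generator object": send the seed instead and rebuild.
import pickle

import torch

gen = torch.Generator(device="cpu").manual_seed(0)
try:
    pickle.dumps(gen)
except TypeError as exc:
    print("cannot pickle:", exc)

# what the tests do instead: ship the seed, recreate in the child process
seed = 0
rebuilt = torch.Generator(device="cpu").manual_seed(seed)
```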
@@ -15,6 +15,7 @@

 import gc
 import random
+import traceback
 import unittest

 import numpy as np
@@ -40,10 +41,13 @@ from diffusers.utils.testing_utils import (
     backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
+    is_torch_compile,
     load_image,
     load_numpy,
     nightly,
+    require_torch_2,
     require_torch_accelerator,
+    run_test_in_subprocess,
     skip_mps,
     slow,
     torch_device,
@@ -66,6 +70,38 @@ from ..test_pipelines_common import (
 enable_full_determinism()


+# Will be run via run_test_in_subprocess
+def _test_img2img_compile(in_queue, out_queue, timeout):
+    error = None
+    try:
+        inputs = in_queue.get(timeout=timeout)
+        torch_device = inputs.pop("torch_device")
+        seed = inputs.pop("seed")
+        inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
+
+        pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe.unet.set_default_attn_processor()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.unet.to(memory_format=torch.channels_last)
+        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 768, 3)
+        expected_slice = np.array([0.0606, 0.0570, 0.0805, 0.0579, 0.0628, 0.0623, 0.0843, 0.1115, 0.0806])
+
+        assert np.abs(expected_slice - image_slice).max() < 1e-3
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
 class StableDiffusionImg2ImgPipelineFastTests(
     IPAdapterTesterMixin,
     PipelineLatentTesterMixin,
@@ -618,6 +654,17 @@ class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase):
         assert out.nsfw_content_detected[0], f"Safety checker should work for prompt: {inputs['prompt']}"
         assert np.abs(out.images[0]).sum() < 1e-5  # should be all zeros

+    @is_torch_compile
+    @require_torch_2
+    def test_img2img_compile(self):
+        seed = 0
+        inputs = self.get_inputs(torch_device, seed=seed)
+        # Can't pickle a Generator object
+        del inputs["generator"]
+        inputs["torch_device"] = torch_device
+        inputs["seed"] = seed
+        run_test_in_subprocess(test_case=self, target_func=_test_img2img_compile, inputs=inputs)
+

 @nightly
 @require_torch_accelerator
@@ -15,6 +15,7 @@

 import gc
 import random
+import traceback
 import unittest

 import numpy as np
@@ -43,10 +44,13 @@ from diffusers.utils.testing_utils import (
     backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
+    is_torch_compile,
     load_image,
     load_numpy,
     nightly,
+    require_torch_2,
     require_torch_accelerator,
+    run_test_in_subprocess,
     slow,
     torch_device,
 )
@@ -67,6 +71,40 @@ from ..test_pipelines_common import (
 enable_full_determinism()


+# Will be run via run_test_in_subprocess
+def _test_inpaint_compile(in_queue, out_queue, timeout):
+    error = None
+    try:
+        inputs = in_queue.get(timeout=timeout)
+        torch_device = inputs.pop("torch_device")
+        seed = inputs.pop("seed")
+        inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
+
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            "botp/stable-diffusion-v1-5-inpainting", safety_checker=None
+        )
+        pipe.unet.set_default_attn_processor()
+        pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        pipe.unet.to(memory_format=torch.channels_last)
+        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+        image = pipe(**inputs).images
+        image_slice = image[0, 253:256, 253:256, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0689, 0.0699, 0.0790, 0.0536, 0.0470, 0.0488, 0.041, 0.0508, 0.04179])
+        assert np.abs(expected_slice - image_slice).max() < 3e-3
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
 class StableDiffusionInpaintPipelineFastTests(
     IPAdapterTesterMixin,
     PipelineLatentTesterMixin,
@@ -689,6 +727,17 @@ class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase):
         # make sure that less than 2.2 GB is allocated
         assert mem_bytes < 2.2 * 10**9

+    @is_torch_compile
+    @require_torch_2
+    def test_inpaint_compile(self):
+        seed = 0
+        inputs = self.get_inputs(torch_device, seed=seed)
+        # Can't pickle a Generator object
+        del inputs["generator"]
+        inputs["torch_device"] = torch_device
+        inputs["seed"] = seed
+        run_test_in_subprocess(test_case=self, target_func=_test_inpaint_compile, inputs=inputs)
+
     def test_stable_diffusion_inpaint_pil_input_resolution_test(self):
         pipe = StableDiffusionInpaintPipeline.from_pretrained(
             "botp/stable-diffusion-v1-5-inpainting", safety_checker=None
@@ -915,6 +964,11 @@ class StableDiffusionInpaintPipelineAsymmetricAutoencoderKLSlowTests(unittest.Te
         # make sure that less than 2.45 GB is allocated
         assert mem_bytes < 2.45 * 10**9

+    @is_torch_compile
+    @require_torch_2
+    def test_inpaint_compile(self):
+        pass
+
     def test_stable_diffusion_inpaint_pil_input_resolution_test(self):
         vae = AsymmetricAutoencoderKL.from_pretrained(
             "cross-attention/asymmetric-autoencoder-kl-x-1-5",
@@ -1994,9 +1994,7 @@ class PipelineSlowTests(unittest.TestCase):
         reason="Torch Dynamo isn't yet supported for Python 3.12.",
     )
     def test_from_save_pretrained_dynamo(self):
-        torch.compiler.rest()
-        with torch._inductor.utils.fresh_inductor_cache():
-            run_test_in_subprocess(test_case=self, target_func=_test_from_save_pretrained_dynamo, inputs=None)
+        run_test_in_subprocess(test_case=self, target_func=_test_from_save_pretrained_dynamo, inputs=None)

     def test_from_pretrained_hub(self):
         model_path = "google/ddpm-cifar10-32"
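Note that the removed guard called `torch.compiler.rest()`, which is not a real API (`torch.compiler.reset()` is), so the old line would have raised `AttributeError` before any cache reset happened; deleting the wrapper also removes that latent bug.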
@@ -2208,7 +2206,7 @@ class TestLoraHotSwappingForPipeline(unittest.TestCase):
         # It is critical that the dynamo cache is reset for each test. Otherwise, if the test re-uses the same model,
         # there will be recompilation errors, as torch caches the model when run in the same process.
         super().tearDown()
-        torch.compiler.reset()
+        torch._dynamo.reset()
         gc.collect()
         backend_empty_cache(torch_device)

@@ -2333,21 +2331,21 @@ class TestLoraHotSwappingForPipeline(unittest.TestCase):
     def test_hotswapping_compiled_pipline_linear(self, rank0, rank1):
         # It's important to add this context to raise an error on recompilation
         target_modules = ["to_q", "to_k", "to_v", "to_out.0"]
-        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
+        with torch._dynamo.config.patch(error_on_recompile=True):
             self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)

     @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
     def test_hotswapping_compiled_pipline_conv2d(self, rank0, rank1):
         # It's important to add this context to raise an error on recompilation
         target_modules = ["conv", "conv1", "conv2"]
-        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
+        with torch._dynamo.config.patch(error_on_recompile=True):
             self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)

     @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
     def test_hotswapping_compiled_pipline_both_linear_and_conv2d(self, rank0, rank1):
         # It's important to add this context to raise an error on recompilation
         target_modules = ["to_q", "conv"]
-        with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache():
+        with torch._dynamo.config.patch(error_on_recompile=True):
             self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)

     def test_enable_lora_hotswap_called_after_adapter_added_raises(self):

@@ -1111,14 +1111,14 @@ class PipelineTesterMixin:
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
-        torch.compiler.reset()
+        torch._dynamo.reset()
         gc.collect()
         backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test in case of CUDA runtime errors
         super().tearDown()
-        torch.compiler.reset()
+        torch._dynamo.reset()
         gc.collect()
         backend_empty_cache(torch_device)
@@ -1,5 +1,6 @@
 import gc
 import random
+import traceback
 import unittest

 import numpy as np
@@ -26,7 +27,9 @@ from diffusers.utils.testing_utils import (
     floats_tensor,
     load_image,
     nightly,
+    require_torch_2,
     require_torch_accelerator,
+    run_test_in_subprocess,
     torch_device,
 )
 from diffusers.utils.torch_utils import randn_tensor
@@ -42,6 +45,38 @@ from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, Pipeline
 enable_full_determinism()


+# Will be run via run_test_in_subprocess
+def _test_unidiffuser_compile(in_queue, out_queue, timeout):
+    error = None
+    try:
+        inputs = in_queue.get(timeout=timeout)
+        torch_device = inputs.pop("torch_device")
+        seed = inputs.pop("seed")
+        inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)
+
+        pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1")
+        # pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+        pipe = pipe.to(torch_device)
+
+        pipe.unet.to(memory_format=torch.channels_last)
+        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
+        pipe.set_progress_bar_config(disable=None)
+
+        image = pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1].flatten()
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.2402, 0.2375, 0.2285, 0.2378, 0.2407, 0.2263, 0.2354, 0.2307, 0.2520])
+        assert np.abs(image_slice - expected_slice).max() < 1e-1
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
 class UniDiffuserPipelineFastTests(
     PipelineTesterMixin, PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase
 ):
@@ -655,6 +690,19 @@ class UniDiffuserPipelineSlowTests(unittest.TestCase):
         expected_text_prefix = "An astronaut"
         assert text[0][: len(expected_text_prefix)] == expected_text_prefix

+    @unittest.skip(reason="Skip torch.compile test to speed up the slow test suite.")
+    @require_torch_2
+    def test_unidiffuser_compile(self, seed=0):
+        inputs = self.get_inputs(torch_device, seed=seed, generate_latents=True)
+        # Delete prompt and image for joint inference.
+        del inputs["prompt"]
+        del inputs["image"]
+        # Can't pickle a Generator object
+        del inputs["generator"]
+        inputs["torch_device"] = torch_device
+        inputs["seed"] = seed
+        run_test_in_subprocess(test_case=self, target_func=_test_unidiffuser_compile, inputs=inputs)
+

 @nightly
 @require_torch_accelerator