mirror of
https://github.com/huggingface/diffusers.git
synced 2026-02-25 04:10:34 +08:00
Compare commits
19 Commits
pin-transf
...
deprecate-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0e0dc27488 | ||
|
|
df0cc8469c | ||
|
|
1f6ac1c3d1 | ||
|
|
5e94d62eb4 | ||
|
|
7ab2011759 | ||
|
|
4890e9bf70 | ||
|
|
f1e5914120 | ||
|
|
a80b19218b | ||
|
|
01de02e8b4 | ||
|
|
db2d7e7bc4 | ||
|
|
f8d3db9ca7 | ||
|
|
99daaa802d | ||
|
|
fe78a7b7c6 | ||
|
|
53e1d0e458 | ||
|
|
a577ec36df | ||
|
|
6875490c3b | ||
|
|
64734b2115 | ||
|
|
f81e653197 | ||
|
|
bcbbded7c3 |
13
.github/workflows/pr_tests.yml
vendored
13
.github/workflows/pr_tests.yml
vendored
@@ -92,7 +92,6 @@ jobs:
|
||||
runner: aws-general-8-plus
|
||||
image: diffusers/diffusers-pytorch-cpu
|
||||
report: torch_example_cpu
|
||||
|
||||
name: ${{ matrix.config.name }}
|
||||
|
||||
runs-on:
|
||||
@@ -115,8 +114,7 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
uv pip install -e ".[quality]"
|
||||
#uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
|
||||
uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
|
||||
uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
|
||||
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
|
||||
|
||||
- name: Environment
|
||||
@@ -218,8 +216,6 @@ jobs:
|
||||
|
||||
run_lora_tests:
|
||||
needs: [check_code_quality, check_repository_consistency]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
|
||||
name: LoRA tests with PEFT main
|
||||
|
||||
@@ -247,9 +243,8 @@ jobs:
|
||||
uv pip install -U peft@git+https://github.com/huggingface/peft.git --no-deps
|
||||
uv pip install -U tokenizers
|
||||
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
|
||||
#uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
|
||||
uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
|
||||
|
||||
uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
@@ -275,6 +270,6 @@ jobs:
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
name: pr_main_test_reports
|
||||
name: pr_lora_test_reports
|
||||
path: reports
|
||||
|
||||
|
||||
14
.github/workflows/pr_tests_gpu.yml
vendored
14
.github/workflows/pr_tests_gpu.yml
vendored
@@ -131,8 +131,7 @@ jobs:
|
||||
run: |
|
||||
uv pip install -e ".[quality]"
|
||||
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
#uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
|
||||
uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
|
||||
uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
@@ -199,16 +198,10 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
# Install pkgs which depend on setuptools<81 for pkg_resources first with no build isolation
|
||||
uv pip install pip==25.2 setuptools==80.10.2
|
||||
uv pip install --no-build-isolation k-diffusion==0.0.12
|
||||
uv pip install --upgrade pip setuptools
|
||||
# Install the rest as normal
|
||||
uv pip install -e ".[quality]"
|
||||
uv pip install peft@git+https://github.com/huggingface/peft.git
|
||||
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
#uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
|
||||
uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
|
||||
uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
@@ -269,8 +262,7 @@ jobs:
|
||||
nvidia-smi
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
#uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
|
||||
uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
|
||||
uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
|
||||
uv pip install -e ".[quality,training]"
|
||||
|
||||
- name: Environment
|
||||
|
||||
15
.github/workflows/push_tests.yml
vendored
15
.github/workflows/push_tests.yml
vendored
@@ -76,9 +76,7 @@ jobs:
|
||||
run: |
|
||||
uv pip install -e ".[quality]"
|
||||
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
#uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
|
||||
uv pip uninstall transformers huggingface_hub && uv pip install transformers
|
||||
|
||||
uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
@@ -127,16 +125,10 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
# Install pkgs which depend on setuptools<81 for pkg_resources first with no build isolation
|
||||
uv pip install pip==25.2 setuptools==80.10.2
|
||||
uv pip install --no-build-isolation k-diffusion==0.0.12
|
||||
uv pip install --upgrade pip setuptools
|
||||
# Install the rest as normal
|
||||
uv pip install -e ".[quality]"
|
||||
uv pip install peft@git+https://github.com/huggingface/peft.git
|
||||
uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
#uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
|
||||
uv pip uninstall transformers huggingface_hub && uv pip install transformers
|
||||
uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
|
||||
|
||||
- name: Environment
|
||||
run: |
|
||||
@@ -188,8 +180,7 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
uv pip install -e ".[quality,training]"
|
||||
#uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
|
||||
uv pip uninstall transformers huggingface_hub && uv pip install transformers
|
||||
uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
|
||||
- name: Environment
|
||||
run: |
|
||||
python utils/print_env.py
|
||||
|
||||
2
.github/workflows/push_tests_mps.yml
vendored
2
.github/workflows/push_tests_mps.yml
vendored
@@ -41,7 +41,7 @@ jobs:
|
||||
shell: arch -arch arm64 bash {0}
|
||||
run: |
|
||||
${CONDA_RUN} python -m pip install --upgrade pip uv
|
||||
${CONDA_RUN} python -m uv pip install -e ".[quality,test]"
|
||||
${CONDA_RUN} python -m uv pip install -e ".[quality]"
|
||||
${CONDA_RUN} python -m uv pip install torch torchvision torchaudio
|
||||
${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
|
||||
${CONDA_RUN} python -m uv pip install transformers --upgrade
|
||||
|
||||
@@ -29,7 +29,7 @@ Qwen-Image comes in the following variants:
|
||||
| Qwen-Image-Edit Plus | [Qwen/Qwen-Image-Edit-2509](https://huggingface.co/Qwen/Qwen-Image-Edit-2509) |
|
||||
|
||||
> [!TIP]
|
||||
> [Caching](../../optimization/cache) may also speed up inference by storing and reusing intermediate outputs.
|
||||
> See the [Caching](../../optimization/cache) guide to speed up inference by storing and reusing intermediate outputs.
|
||||
|
||||
## LoRA for faster inference
|
||||
|
||||
@@ -190,6 +190,12 @@ For detailed benchmark scripts and results, see [this gist](https://gist.github.
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## QwenImageLayeredPipeline
|
||||
|
||||
[[autodoc]] QwenImageLayeredPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## QwenImagePipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.qwenimage.pipeline_output.QwenImagePipelineOutput
|
||||
@@ -17,6 +17,9 @@ import logging
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from diffusers.utils import is_transformers_version
|
||||
|
||||
|
||||
sys.path.append("..")
|
||||
@@ -30,6 +33,7 @@ stream_handler = logging.StreamHandler(sys.stdout)
|
||||
logger.addHandler(stream_handler)
|
||||
|
||||
|
||||
@unittest.skipIf(is_transformers_version(">=", "4.57.5"), "Size mismatch")
|
||||
class CustomDiffusion(ExamplesTestsAccelerate):
|
||||
def test_custom_diffusion(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
|
||||
4
setup.py
4
setup.py
@@ -101,6 +101,7 @@ _deps = [
|
||||
"datasets",
|
||||
"filelock",
|
||||
"flax>=0.4.1",
|
||||
"ftfy",
|
||||
"hf-doc-builder>=0.3.0",
|
||||
"httpx<1.0.0",
|
||||
"huggingface-hub>=0.34.0,<2.0",
|
||||
@@ -221,12 +222,14 @@ extras["docs"] = deps_list("hf-doc-builder")
|
||||
extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2", "peft", "timm")
|
||||
extras["test"] = deps_list(
|
||||
"compel",
|
||||
"ftfy",
|
||||
"GitPython",
|
||||
"datasets",
|
||||
"Jinja2",
|
||||
"invisible-watermark",
|
||||
"librosa",
|
||||
"parameterized",
|
||||
"protobuf",
|
||||
"pytest",
|
||||
"pytest-timeout",
|
||||
"pytest-xdist",
|
||||
@@ -235,6 +238,7 @@ extras["test"] = deps_list(
|
||||
"sentencepiece",
|
||||
"scipy",
|
||||
"tiktoken",
|
||||
"torchsde",
|
||||
"torchvision",
|
||||
"transformers",
|
||||
"phonemizer",
|
||||
|
||||
@@ -8,6 +8,7 @@ deps = {
|
||||
"datasets": "datasets",
|
||||
"filelock": "filelock",
|
||||
"flax": "flax>=0.4.1",
|
||||
"ftfy": "ftfy",
|
||||
"hf-doc-builder": "hf-doc-builder>=0.3.0",
|
||||
"httpx": "httpx<1.0.0",
|
||||
"huggingface-hub": "huggingface-hub>=0.34.0,<2.0",
|
||||
|
||||
@@ -48,6 +48,7 @@ _GO_LC_SUPPORTED_PYTORCH_LAYERS = (
|
||||
torch.nn.ConvTranspose2d,
|
||||
torch.nn.ConvTranspose3d,
|
||||
torch.nn.Linear,
|
||||
torch.nn.Embedding,
|
||||
# TODO(aryan): look into torch.nn.LayerNorm, torch.nn.GroupNorm later, seems to be causing some issues with CogVideoX
|
||||
# because of double invocation of the same norm layer in CogVideoXLayerNorm
|
||||
)
|
||||
|
||||
@@ -5472,6 +5472,10 @@ class Flux2LoraLoaderMixin(LoraBaseMixin):
|
||||
logger.warning(warn_msg)
|
||||
state_dict = {k: v for k, v in state_dict.items() if "dora_scale" not in k}
|
||||
|
||||
is_peft_format = any(k.startswith("base_model.model.") for k in state_dict)
|
||||
if is_peft_format:
|
||||
state_dict = {k.replace("base_model.model.", "diffusion_model."): v for k, v in state_dict.items()}
|
||||
|
||||
is_ai_toolkit = any(k.startswith("diffusion_model.") for k in state_dict)
|
||||
if is_ai_toolkit:
|
||||
state_dict = _convert_non_diffusers_flux2_lora_to_diffusers(state_dict)
|
||||
|
||||
@@ -22,7 +22,12 @@ from tokenizers import Tokenizer as TokenizerFast
|
||||
from torch import nn
|
||||
|
||||
from ..models.modeling_utils import load_state_dict
|
||||
from ..utils import _get_model_file, is_accelerate_available, is_transformers_available, logging
|
||||
from ..utils import (
|
||||
_get_model_file,
|
||||
is_accelerate_available,
|
||||
is_transformers_available,
|
||||
logging,
|
||||
)
|
||||
|
||||
|
||||
if is_transformers_available():
|
||||
|
||||
@@ -62,6 +62,8 @@ _REQUIRED_FLEX_VERSION = "2.5.0"
|
||||
_REQUIRED_XLA_VERSION = "2.2"
|
||||
_REQUIRED_XFORMERS_VERSION = "0.0.29"
|
||||
|
||||
logger = get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
_CAN_USE_FLASH_ATTN = is_flash_attn_available() and is_flash_attn_version(">=", _REQUIRED_FLASH_VERSION)
|
||||
_CAN_USE_FLASH_ATTN_3 = is_flash_attn_3_available()
|
||||
_CAN_USE_AITER_ATTN = is_aiter_available() and is_aiter_version(">=", _REQUIRED_AITER_VERSION)
|
||||
@@ -73,8 +75,18 @@ _CAN_USE_XFORMERS_ATTN = is_xformers_available() and is_xformers_version(">=", _
|
||||
|
||||
|
||||
if _CAN_USE_FLASH_ATTN:
|
||||
from flash_attn import flash_attn_func, flash_attn_varlen_func
|
||||
from flash_attn.flash_attn_interface import _wrapped_flash_attn_backward, _wrapped_flash_attn_forward
|
||||
try:
|
||||
from flash_attn import flash_attn_func, flash_attn_varlen_func
|
||||
from flash_attn.flash_attn_interface import _wrapped_flash_attn_backward, _wrapped_flash_attn_forward
|
||||
except (ImportError, OSError, RuntimeError) as e:
|
||||
# Handle ABI mismatch or other import failures gracefully.
|
||||
# This can happen when flash_attn was compiled against a different PyTorch version.
|
||||
logger.warning(f"flash_attn is installed but failed to import: {e}. Falling back to native PyTorch attention.")
|
||||
_CAN_USE_FLASH_ATTN = False
|
||||
flash_attn_func = None
|
||||
flash_attn_varlen_func = None
|
||||
_wrapped_flash_attn_backward = None
|
||||
_wrapped_flash_attn_forward = None
|
||||
else:
|
||||
flash_attn_func = None
|
||||
flash_attn_varlen_func = None
|
||||
@@ -83,26 +95,47 @@ else:
|
||||
|
||||
|
||||
if _CAN_USE_FLASH_ATTN_3:
|
||||
from flash_attn_interface import flash_attn_func as flash_attn_3_func
|
||||
from flash_attn_interface import flash_attn_varlen_func as flash_attn_3_varlen_func
|
||||
try:
|
||||
from flash_attn_interface import flash_attn_func as flash_attn_3_func
|
||||
from flash_attn_interface import flash_attn_varlen_func as flash_attn_3_varlen_func
|
||||
except (ImportError, OSError, RuntimeError) as e:
|
||||
logger.warning(f"flash_attn_3 failed to import: {e}. Falling back to native attention.")
|
||||
_CAN_USE_FLASH_ATTN_3 = False
|
||||
flash_attn_3_func = None
|
||||
flash_attn_3_varlen_func = None
|
||||
else:
|
||||
flash_attn_3_func = None
|
||||
flash_attn_3_varlen_func = None
|
||||
|
||||
if _CAN_USE_AITER_ATTN:
|
||||
from aiter import flash_attn_func as aiter_flash_attn_func
|
||||
try:
|
||||
from aiter import flash_attn_func as aiter_flash_attn_func
|
||||
except (ImportError, OSError, RuntimeError) as e:
|
||||
logger.warning(f"aiter failed to import: {e}. Falling back to native attention.")
|
||||
_CAN_USE_AITER_ATTN = False
|
||||
aiter_flash_attn_func = None
|
||||
else:
|
||||
aiter_flash_attn_func = None
|
||||
|
||||
if _CAN_USE_SAGE_ATTN:
|
||||
from sageattention import (
|
||||
sageattn,
|
||||
sageattn_qk_int8_pv_fp8_cuda,
|
||||
sageattn_qk_int8_pv_fp8_cuda_sm90,
|
||||
sageattn_qk_int8_pv_fp16_cuda,
|
||||
sageattn_qk_int8_pv_fp16_triton,
|
||||
sageattn_varlen,
|
||||
)
|
||||
try:
|
||||
from sageattention import (
|
||||
sageattn,
|
||||
sageattn_qk_int8_pv_fp8_cuda,
|
||||
sageattn_qk_int8_pv_fp8_cuda_sm90,
|
||||
sageattn_qk_int8_pv_fp16_cuda,
|
||||
sageattn_qk_int8_pv_fp16_triton,
|
||||
sageattn_varlen,
|
||||
)
|
||||
except (ImportError, OSError, RuntimeError) as e:
|
||||
logger.warning(f"sageattention failed to import: {e}. Falling back to native attention.")
|
||||
_CAN_USE_SAGE_ATTN = False
|
||||
sageattn = None
|
||||
sageattn_qk_int8_pv_fp8_cuda = None
|
||||
sageattn_qk_int8_pv_fp8_cuda_sm90 = None
|
||||
sageattn_qk_int8_pv_fp16_cuda = None
|
||||
sageattn_qk_int8_pv_fp16_triton = None
|
||||
sageattn_varlen = None
|
||||
else:
|
||||
sageattn = None
|
||||
sageattn_qk_int8_pv_fp16_cuda = None
|
||||
@@ -113,26 +146,48 @@ else:
|
||||
|
||||
|
||||
if _CAN_USE_FLEX_ATTN:
|
||||
# We cannot import the flex_attention function from the package directly because it is expected (from the
|
||||
# pytorch documentation) that the user may compile it. If we import directly, we will not have access to the
|
||||
# compiled function.
|
||||
import torch.nn.attention.flex_attention as flex_attention
|
||||
try:
|
||||
# We cannot import the flex_attention function from the package directly because it is expected (from the
|
||||
# pytorch documentation) that the user may compile it. If we import directly, we will not have access to the
|
||||
# compiled function.
|
||||
import torch.nn.attention.flex_attention as flex_attention
|
||||
except (ImportError, OSError, RuntimeError) as e:
|
||||
logger.warning(f"flex_attention failed to import: {e}. Falling back to native attention.")
|
||||
_CAN_USE_FLEX_ATTN = False
|
||||
flex_attention = None
|
||||
else:
|
||||
flex_attention = None
|
||||
|
||||
|
||||
if _CAN_USE_NPU_ATTN:
|
||||
from torch_npu import npu_fusion_attention
|
||||
try:
|
||||
from torch_npu import npu_fusion_attention
|
||||
except (ImportError, OSError, RuntimeError) as e:
|
||||
logger.warning(f"torch_npu failed to import: {e}. Falling back to native attention.")
|
||||
_CAN_USE_NPU_ATTN = False
|
||||
npu_fusion_attention = None
|
||||
else:
|
||||
npu_fusion_attention = None
|
||||
|
||||
|
||||
if _CAN_USE_XLA_ATTN:
|
||||
from torch_xla.experimental.custom_kernel import flash_attention as xla_flash_attention
|
||||
try:
|
||||
from torch_xla.experimental.custom_kernel import flash_attention as xla_flash_attention
|
||||
except (ImportError, OSError, RuntimeError) as e:
|
||||
logger.warning(f"torch_xla failed to import: {e}. Falling back to native attention.")
|
||||
_CAN_USE_XLA_ATTN = False
|
||||
xla_flash_attention = None
|
||||
else:
|
||||
xla_flash_attention = None
|
||||
|
||||
|
||||
if _CAN_USE_XFORMERS_ATTN:
|
||||
import xformers.ops as xops
|
||||
try:
|
||||
import xformers.ops as xops
|
||||
except (ImportError, OSError, RuntimeError) as e:
|
||||
logger.warning(f"xformers failed to import: {e}. Falling back to native attention.")
|
||||
_CAN_USE_XFORMERS_ATTN = False
|
||||
xops = None
|
||||
else:
|
||||
xops = None
|
||||
|
||||
@@ -158,8 +213,6 @@ else:
|
||||
_register_fake = register_fake_no_op
|
||||
|
||||
|
||||
logger = get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
# TODO(aryan): Add support for the following:
|
||||
# - Sage Attention++
|
||||
# - block sparse, radial and other attention methods
|
||||
@@ -266,6 +319,10 @@ class _HubKernelConfig:
|
||||
function_attr: str
|
||||
revision: str | None = None
|
||||
kernel_fn: Callable | None = None
|
||||
wrapped_forward_attr: str | None = None
|
||||
wrapped_backward_attr: str | None = None
|
||||
wrapped_forward_fn: Callable | None = None
|
||||
wrapped_backward_fn: Callable | None = None
|
||||
|
||||
|
||||
# Registry for hub-based attention kernels
|
||||
@@ -280,7 +337,11 @@ _HUB_KERNELS_REGISTRY: dict["AttentionBackendName", _HubKernelConfig] = {
|
||||
# revision="fake-ops-return-probs",
|
||||
),
|
||||
AttentionBackendName.FLASH_HUB: _HubKernelConfig(
|
||||
repo_id="kernels-community/flash-attn2", function_attr="flash_attn_func", revision=None
|
||||
repo_id="kernels-community/flash-attn2",
|
||||
function_attr="flash_attn_func",
|
||||
revision=None,
|
||||
wrapped_forward_attr="flash_attn_interface._wrapped_flash_attn_forward",
|
||||
wrapped_backward_attr="flash_attn_interface._wrapped_flash_attn_backward",
|
||||
),
|
||||
AttentionBackendName.FLASH_VARLEN_HUB: _HubKernelConfig(
|
||||
repo_id="kernels-community/flash-attn2", function_attr="flash_attn_varlen_func", revision=None
|
||||
@@ -605,22 +666,39 @@ def _flex_attention_causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
|
||||
|
||||
|
||||
# ===== Helpers for downloading kernels =====
|
||||
def _resolve_kernel_attr(module, attr_path: str):
|
||||
target = module
|
||||
for attr in attr_path.split("."):
|
||||
if not hasattr(target, attr):
|
||||
raise AttributeError(f"Kernel module '{module.__name__}' does not define attribute path '{attr_path}'.")
|
||||
target = getattr(target, attr)
|
||||
return target
|
||||
|
||||
|
||||
def _maybe_download_kernel_for_backend(backend: AttentionBackendName) -> None:
|
||||
if backend not in _HUB_KERNELS_REGISTRY:
|
||||
return
|
||||
config = _HUB_KERNELS_REGISTRY[backend]
|
||||
|
||||
if config.kernel_fn is not None:
|
||||
needs_kernel = config.kernel_fn is None
|
||||
needs_wrapped_forward = config.wrapped_forward_attr is not None and config.wrapped_forward_fn is None
|
||||
needs_wrapped_backward = config.wrapped_backward_attr is not None and config.wrapped_backward_fn is None
|
||||
|
||||
if not (needs_kernel or needs_wrapped_forward or needs_wrapped_backward):
|
||||
return
|
||||
|
||||
try:
|
||||
from kernels import get_kernel
|
||||
|
||||
kernel_module = get_kernel(config.repo_id, revision=config.revision)
|
||||
kernel_func = getattr(kernel_module, config.function_attr)
|
||||
if needs_kernel:
|
||||
config.kernel_fn = _resolve_kernel_attr(kernel_module, config.function_attr)
|
||||
|
||||
# Cache the downloaded kernel function in the config object
|
||||
config.kernel_fn = kernel_func
|
||||
if needs_wrapped_forward:
|
||||
config.wrapped_forward_fn = _resolve_kernel_attr(kernel_module, config.wrapped_forward_attr)
|
||||
|
||||
if needs_wrapped_backward:
|
||||
config.wrapped_backward_fn = _resolve_kernel_attr(kernel_module, config.wrapped_backward_attr)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"An error occurred while fetching kernel '{config.repo_id}' from the Hub: {e}")
|
||||
@@ -1071,6 +1149,237 @@ def _flash_attention_backward_op(
|
||||
return grad_query, grad_key, grad_value
|
||||
|
||||
|
||||
def _flash_attention_hub_forward_op(
|
||||
ctx: torch.autograd.function.FunctionCtx,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
attn_mask: torch.Tensor | None = None,
|
||||
dropout_p: float = 0.0,
|
||||
is_causal: bool = False,
|
||||
scale: float | None = None,
|
||||
enable_gqa: bool = False,
|
||||
return_lse: bool = False,
|
||||
_save_ctx: bool = True,
|
||||
_parallel_config: "ParallelConfig" | None = None,
|
||||
):
|
||||
if attn_mask is not None:
|
||||
raise ValueError("`attn_mask` is not yet supported for flash-attn hub kernels.")
|
||||
if enable_gqa:
|
||||
raise ValueError("`enable_gqa` is not yet supported for flash-attn hub kernels.")
|
||||
|
||||
config = _HUB_KERNELS_REGISTRY[AttentionBackendName.FLASH_HUB]
|
||||
wrapped_forward_fn = config.wrapped_forward_fn
|
||||
wrapped_backward_fn = config.wrapped_backward_fn
|
||||
if wrapped_forward_fn is None or wrapped_backward_fn is None:
|
||||
raise RuntimeError(
|
||||
"Flash attention hub kernels must expose `_wrapped_flash_attn_forward` and `_wrapped_flash_attn_backward` "
|
||||
"for context parallel execution."
|
||||
)
|
||||
|
||||
if scale is None:
|
||||
scale = query.shape[-1] ** (-0.5)
|
||||
|
||||
window_size = (-1, -1)
|
||||
softcap = 0.0
|
||||
alibi_slopes = None
|
||||
deterministic = False
|
||||
grad_enabled = any(x.requires_grad for x in (query, key, value))
|
||||
|
||||
if grad_enabled or (_parallel_config is not None and _parallel_config.context_parallel_config._world_size > 1):
|
||||
dropout_p = dropout_p if dropout_p > 0 else 1e-30
|
||||
|
||||
with torch.set_grad_enabled(grad_enabled):
|
||||
out, lse, S_dmask, rng_state = wrapped_forward_fn(
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
dropout_p,
|
||||
scale,
|
||||
is_causal,
|
||||
window_size[0],
|
||||
window_size[1],
|
||||
softcap,
|
||||
alibi_slopes,
|
||||
return_lse,
|
||||
)
|
||||
lse = lse.permute(0, 2, 1).contiguous()
|
||||
|
||||
if _save_ctx:
|
||||
ctx.save_for_backward(query, key, value, out, lse, rng_state)
|
||||
ctx.dropout_p = dropout_p
|
||||
ctx.scale = scale
|
||||
ctx.is_causal = is_causal
|
||||
ctx.window_size = window_size
|
||||
ctx.softcap = softcap
|
||||
ctx.alibi_slopes = alibi_slopes
|
||||
ctx.deterministic = deterministic
|
||||
|
||||
return (out, lse) if return_lse else out
|
||||
|
||||
|
||||
def _flash_attention_hub_backward_op(
|
||||
ctx: torch.autograd.function.FunctionCtx,
|
||||
grad_out: torch.Tensor,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
config = _HUB_KERNELS_REGISTRY[AttentionBackendName.FLASH_HUB]
|
||||
wrapped_backward_fn = config.wrapped_backward_fn
|
||||
if wrapped_backward_fn is None:
|
||||
raise RuntimeError(
|
||||
"Flash attention hub kernels must expose `_wrapped_flash_attn_backward` for context parallel execution."
|
||||
)
|
||||
|
||||
query, key, value, out, lse, rng_state = ctx.saved_tensors
|
||||
grad_query, grad_key, grad_value = torch.empty_like(query), torch.empty_like(key), torch.empty_like(value)
|
||||
|
||||
_ = wrapped_backward_fn(
|
||||
grad_out,
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
out,
|
||||
lse,
|
||||
grad_query,
|
||||
grad_key,
|
||||
grad_value,
|
||||
ctx.dropout_p,
|
||||
ctx.scale,
|
||||
ctx.is_causal,
|
||||
ctx.window_size[0],
|
||||
ctx.window_size[1],
|
||||
ctx.softcap,
|
||||
ctx.alibi_slopes,
|
||||
ctx.deterministic,
|
||||
rng_state,
|
||||
)
|
||||
|
||||
grad_query = grad_query[..., : grad_out.shape[-1]]
|
||||
grad_key = grad_key[..., : grad_out.shape[-1]]
|
||||
grad_value = grad_value[..., : grad_out.shape[-1]]
|
||||
|
||||
return grad_query, grad_key, grad_value
|
||||
|
||||
|
||||
def _flash_attention_3_hub_forward_op(
|
||||
ctx: torch.autograd.function.FunctionCtx,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
attn_mask: torch.Tensor | None = None,
|
||||
dropout_p: float = 0.0,
|
||||
is_causal: bool = False,
|
||||
scale: float | None = None,
|
||||
enable_gqa: bool = False,
|
||||
return_lse: bool = False,
|
||||
_save_ctx: bool = True,
|
||||
_parallel_config: "ParallelConfig" | None = None,
|
||||
*,
|
||||
window_size: tuple[int, int] = (-1, -1),
|
||||
softcap: float = 0.0,
|
||||
num_splits: int = 1,
|
||||
pack_gqa: bool | None = None,
|
||||
deterministic: bool = False,
|
||||
sm_margin: int = 0,
|
||||
):
|
||||
if attn_mask is not None:
|
||||
raise ValueError("`attn_mask` is not yet supported for flash-attn 3 hub kernels.")
|
||||
if dropout_p != 0.0:
|
||||
raise ValueError("`dropout_p` is not yet supported for flash-attn 3 hub kernels.")
|
||||
if enable_gqa:
|
||||
raise ValueError("`enable_gqa` is not yet supported for flash-attn 3 hub kernels.")
|
||||
|
||||
func = _HUB_KERNELS_REGISTRY[AttentionBackendName._FLASH_3_HUB].kernel_fn
|
||||
out = func(
|
||||
q=query,
|
||||
k=key,
|
||||
v=value,
|
||||
softmax_scale=scale,
|
||||
causal=is_causal,
|
||||
qv=None,
|
||||
q_descale=None,
|
||||
k_descale=None,
|
||||
v_descale=None,
|
||||
window_size=window_size,
|
||||
softcap=softcap,
|
||||
num_splits=num_splits,
|
||||
pack_gqa=pack_gqa,
|
||||
deterministic=deterministic,
|
||||
sm_margin=sm_margin,
|
||||
return_attn_probs=return_lse,
|
||||
)
|
||||
|
||||
lse = None
|
||||
if return_lse:
|
||||
out, lse = out
|
||||
lse = lse.permute(0, 2, 1).contiguous()
|
||||
|
||||
if _save_ctx:
|
||||
ctx.save_for_backward(query, key, value)
|
||||
ctx.scale = scale
|
||||
ctx.is_causal = is_causal
|
||||
ctx._hub_kernel = func
|
||||
|
||||
return (out, lse) if return_lse else out
|
||||
|
||||
|
||||
def _flash_attention_3_hub_backward_op(
|
||||
ctx: torch.autograd.function.FunctionCtx,
|
||||
grad_out: torch.Tensor,
|
||||
*args,
|
||||
window_size: tuple[int, int] = (-1, -1),
|
||||
softcap: float = 0.0,
|
||||
num_splits: int = 1,
|
||||
pack_gqa: bool | None = None,
|
||||
deterministic: bool = False,
|
||||
sm_margin: int = 0,
|
||||
):
|
||||
query, key, value = ctx.saved_tensors
|
||||
kernel_fn = ctx._hub_kernel
|
||||
# NOTE: Unlike the FA2 hub kernel, the FA3 hub kernel does not expose separate wrapped forward/backward
|
||||
# primitives (no `wrapped_forward_attr`/`wrapped_backward_attr` in its `_HubKernelConfig`). We
|
||||
# therefore rerun the forward pass under `torch.enable_grad()` and differentiate through it with
|
||||
# `torch.autograd.grad()`. This is a second forward pass during backward; it can be avoided once
|
||||
# the FA3 hub exposes a dedicated fused backward kernel (analogous to `_wrapped_flash_attn_backward`
|
||||
# in the FA2 hub), at which point this can be refactored to match `_flash_attention_hub_backward_op`.
|
||||
with torch.enable_grad():
|
||||
query_r = query.detach().requires_grad_(True)
|
||||
key_r = key.detach().requires_grad_(True)
|
||||
value_r = value.detach().requires_grad_(True)
|
||||
|
||||
out = kernel_fn(
|
||||
q=query_r,
|
||||
k=key_r,
|
||||
v=value_r,
|
||||
softmax_scale=ctx.scale,
|
||||
causal=ctx.is_causal,
|
||||
qv=None,
|
||||
q_descale=None,
|
||||
k_descale=None,
|
||||
v_descale=None,
|
||||
window_size=window_size,
|
||||
softcap=softcap,
|
||||
num_splits=num_splits,
|
||||
pack_gqa=pack_gqa,
|
||||
deterministic=deterministic,
|
||||
sm_margin=sm_margin,
|
||||
return_attn_probs=False,
|
||||
)
|
||||
if isinstance(out, tuple):
|
||||
out = out[0]
|
||||
|
||||
grad_query, grad_key, grad_value = torch.autograd.grad(
|
||||
out,
|
||||
(query_r, key_r, value_r),
|
||||
grad_out,
|
||||
retain_graph=False,
|
||||
allow_unused=False,
|
||||
)
|
||||
|
||||
return grad_query, grad_key, grad_value
|
||||
|
||||
|
||||
def _sage_attention_forward_op(
|
||||
ctx: torch.autograd.function.FunctionCtx,
|
||||
query: torch.Tensor,
|
||||
@@ -1109,6 +1418,46 @@ def _sage_attention_forward_op(
|
||||
return (out, lse) if return_lse else out
|
||||
|
||||
|
||||
def _sage_attention_hub_forward_op(
|
||||
ctx: torch.autograd.function.FunctionCtx,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
attn_mask: torch.Tensor | None = None,
|
||||
dropout_p: float = 0.0,
|
||||
is_causal: bool = False,
|
||||
scale: float | None = None,
|
||||
enable_gqa: bool = False,
|
||||
return_lse: bool = False,
|
||||
_save_ctx: bool = True,
|
||||
_parallel_config: "ParallelConfig" | None = None,
|
||||
):
|
||||
if attn_mask is not None:
|
||||
raise ValueError("`attn_mask` is not yet supported for Sage attention.")
|
||||
if dropout_p > 0.0:
|
||||
raise ValueError("`dropout_p` is not yet supported for Sage attention.")
|
||||
if enable_gqa:
|
||||
raise ValueError("`enable_gqa` is not yet supported for Sage attention.")
|
||||
|
||||
func = _HUB_KERNELS_REGISTRY[AttentionBackendName.SAGE_HUB].kernel_fn
|
||||
out = func(
|
||||
q=query,
|
||||
k=key,
|
||||
v=value,
|
||||
tensor_layout="NHD",
|
||||
is_causal=is_causal,
|
||||
sm_scale=scale,
|
||||
return_lse=return_lse,
|
||||
)
|
||||
|
||||
lse = None
|
||||
if return_lse:
|
||||
out, lse, *_ = out
|
||||
lse = lse.permute(0, 2, 1).contiguous()
|
||||
|
||||
return (out, lse) if return_lse else out
|
||||
|
||||
|
||||
def _sage_attention_backward_op(
|
||||
ctx: torch.autograd.function.FunctionCtx,
|
||||
grad_out: torch.Tensor,
|
||||
@@ -1117,6 +1466,26 @@ def _sage_attention_backward_op(
|
||||
raise NotImplementedError("Backward pass is not implemented for Sage attention.")
|
||||
|
||||
|
||||
def _maybe_modify_attn_mask_npu(query: torch.Tensor, key: torch.Tensor, attn_mask: torch.Tensor | None = None):
|
||||
# Skip Attention Mask if all values are 1, `None` mask can speedup the computation
|
||||
if attn_mask is not None and torch.all(attn_mask != 0):
|
||||
attn_mask = None
|
||||
|
||||
# Reshape Attention Mask: [batch_size, seq_len_k] -> [batch_size, 1, sqe_len_q, seq_len_k]
|
||||
# https://www.hiascend.com/document/detail/zh/Pytorch/730/apiref/torchnpuCustomsapi/docs/context/torch_npu-npu_fusion_attention.md
|
||||
if (
|
||||
attn_mask is not None
|
||||
and attn_mask.ndim == 2
|
||||
and attn_mask.shape[0] == query.shape[0]
|
||||
and attn_mask.shape[1] == key.shape[1]
|
||||
):
|
||||
B, Sq, Skv = attn_mask.shape[0], query.shape[1], key.shape[1]
|
||||
attn_mask = ~attn_mask.to(torch.bool)
|
||||
attn_mask = attn_mask.unsqueeze(1).expand(B, Sq, Skv).unsqueeze(1).contiguous()
|
||||
|
||||
return attn_mask
|
||||
|
||||
|
||||
def _npu_attention_forward_op(
|
||||
ctx: torch.autograd.function.FunctionCtx,
|
||||
query: torch.Tensor,
|
||||
@@ -1134,11 +1503,14 @@ def _npu_attention_forward_op(
|
||||
if return_lse:
|
||||
raise ValueError("NPU attention backend does not support setting `return_lse=True`.")
|
||||
|
||||
attn_mask = _maybe_modify_attn_mask_npu(query, key, attn_mask)
|
||||
|
||||
out = npu_fusion_attention(
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
query.size(2), # num_heads
|
||||
atten_mask=attn_mask,
|
||||
input_layout="BSND",
|
||||
pse=None,
|
||||
scale=1.0 / math.sqrt(query.shape[-1]) if scale is None else scale,
|
||||
@@ -1942,7 +2314,7 @@ def _flash_attention(
|
||||
@_AttentionBackendRegistry.register(
|
||||
AttentionBackendName.FLASH_HUB,
|
||||
constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape],
|
||||
supports_context_parallel=False,
|
||||
supports_context_parallel=True,
|
||||
)
|
||||
def _flash_attention_hub(
|
||||
query: torch.Tensor,
|
||||
@@ -1960,17 +2332,35 @@ def _flash_attention_hub(
|
||||
raise ValueError("`attn_mask` is not supported for flash-attn 2.")
|
||||
|
||||
func = _HUB_KERNELS_REGISTRY[AttentionBackendName.FLASH_HUB].kernel_fn
|
||||
out = func(
|
||||
q=query,
|
||||
k=key,
|
||||
v=value,
|
||||
dropout_p=dropout_p,
|
||||
softmax_scale=scale,
|
||||
causal=is_causal,
|
||||
return_attn_probs=return_lse,
|
||||
)
|
||||
if return_lse:
|
||||
out, lse, *_ = out
|
||||
if _parallel_config is None:
|
||||
out = func(
|
||||
q=query,
|
||||
k=key,
|
||||
v=value,
|
||||
dropout_p=dropout_p,
|
||||
softmax_scale=scale,
|
||||
causal=is_causal,
|
||||
return_attn_probs=return_lse,
|
||||
)
|
||||
if return_lse:
|
||||
out, lse, *_ = out
|
||||
else:
|
||||
out = _templated_context_parallel_attention(
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
None,
|
||||
dropout_p,
|
||||
is_causal,
|
||||
scale,
|
||||
False,
|
||||
return_lse,
|
||||
forward_op=_flash_attention_hub_forward_op,
|
||||
backward_op=_flash_attention_hub_backward_op,
|
||||
_parallel_config=_parallel_config,
|
||||
)
|
||||
if return_lse:
|
||||
out, lse = out
|
||||
|
||||
return (out, lse) if return_lse else out
|
||||
|
||||
@@ -2117,7 +2507,7 @@ def _flash_attention_3(
|
||||
@_AttentionBackendRegistry.register(
|
||||
AttentionBackendName._FLASH_3_HUB,
|
||||
constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape],
|
||||
supports_context_parallel=False,
|
||||
supports_context_parallel=True,
|
||||
)
|
||||
def _flash_attention_3_hub(
|
||||
query: torch.Tensor,
|
||||
@@ -2132,33 +2522,68 @@ def _flash_attention_3_hub(
|
||||
return_attn_probs: bool = False,
|
||||
_parallel_config: "ParallelConfig" | None = None,
|
||||
) -> torch.Tensor:
|
||||
if _parallel_config:
|
||||
raise NotImplementedError(f"{AttentionBackendName._FLASH_3_HUB.value} is not implemented for parallelism yet.")
|
||||
if attn_mask is not None:
|
||||
raise ValueError("`attn_mask` is not supported for flash-attn 3.")
|
||||
|
||||
func = _HUB_KERNELS_REGISTRY[AttentionBackendName._FLASH_3_HUB].kernel_fn
|
||||
out = func(
|
||||
q=query,
|
||||
k=key,
|
||||
v=value,
|
||||
softmax_scale=scale,
|
||||
causal=is_causal,
|
||||
qv=None,
|
||||
q_descale=None,
|
||||
k_descale=None,
|
||||
v_descale=None,
|
||||
if _parallel_config is None:
|
||||
out = func(
|
||||
q=query,
|
||||
k=key,
|
||||
v=value,
|
||||
softmax_scale=scale,
|
||||
causal=is_causal,
|
||||
qv=None,
|
||||
q_descale=None,
|
||||
k_descale=None,
|
||||
v_descale=None,
|
||||
window_size=window_size,
|
||||
softcap=softcap,
|
||||
num_splits=1,
|
||||
pack_gqa=None,
|
||||
deterministic=deterministic,
|
||||
sm_margin=0,
|
||||
return_attn_probs=return_attn_probs,
|
||||
)
|
||||
return (out[0], out[1]) if return_attn_probs else out
|
||||
|
||||
forward_op = functools.partial(
|
||||
_flash_attention_3_hub_forward_op,
|
||||
window_size=window_size,
|
||||
softcap=softcap,
|
||||
num_splits=1,
|
||||
pack_gqa=None,
|
||||
deterministic=deterministic,
|
||||
sm_margin=0,
|
||||
return_attn_probs=return_attn_probs,
|
||||
)
|
||||
# When `return_attn_probs` is True, the above returns a tuple of
|
||||
# actual outputs and lse.
|
||||
return (out[0], out[1]) if return_attn_probs else out
|
||||
backward_op = functools.partial(
|
||||
_flash_attention_3_hub_backward_op,
|
||||
window_size=window_size,
|
||||
softcap=softcap,
|
||||
num_splits=1,
|
||||
pack_gqa=None,
|
||||
deterministic=deterministic,
|
||||
sm_margin=0,
|
||||
)
|
||||
out = _templated_context_parallel_attention(
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
None,
|
||||
0.0,
|
||||
is_causal,
|
||||
scale,
|
||||
False,
|
||||
return_attn_probs,
|
||||
forward_op=forward_op,
|
||||
backward_op=backward_op,
|
||||
_parallel_config=_parallel_config,
|
||||
)
|
||||
if return_attn_probs:
|
||||
out, lse = out
|
||||
return out, lse
|
||||
|
||||
return out
|
||||
|
||||
|
||||
@_AttentionBackendRegistry.register(
|
||||
@@ -2668,16 +3093,17 @@ def _native_npu_attention(
|
||||
return_lse: bool = False,
|
||||
_parallel_config: "ParallelConfig" | None = None,
|
||||
) -> torch.Tensor:
|
||||
if attn_mask is not None:
|
||||
raise ValueError("`attn_mask` is not supported for NPU attention")
|
||||
if return_lse:
|
||||
raise ValueError("NPU attention backend does not support setting `return_lse=True`.")
|
||||
if _parallel_config is None:
|
||||
attn_mask = _maybe_modify_attn_mask_npu(query, key, attn_mask)
|
||||
|
||||
out = npu_fusion_attention(
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
query.size(2), # num_heads
|
||||
atten_mask=attn_mask,
|
||||
input_layout="BSND",
|
||||
pse=None,
|
||||
scale=1.0 / math.sqrt(query.shape[-1]) if scale is None else scale,
|
||||
@@ -2692,7 +3118,7 @@ def _native_npu_attention(
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
None,
|
||||
attn_mask,
|
||||
dropout_p,
|
||||
None,
|
||||
scale,
|
||||
@@ -2789,7 +3215,7 @@ def _sage_attention(
|
||||
@_AttentionBackendRegistry.register(
|
||||
AttentionBackendName.SAGE_HUB,
|
||||
constraints=[_check_device_cuda, _check_qkv_dtype_bf16_or_fp16, _check_shape],
|
||||
supports_context_parallel=False,
|
||||
supports_context_parallel=True,
|
||||
)
|
||||
def _sage_attention_hub(
|
||||
query: torch.Tensor,
|
||||
@@ -2817,6 +3243,23 @@ def _sage_attention_hub(
|
||||
)
|
||||
if return_lse:
|
||||
out, lse, *_ = out
|
||||
else:
|
||||
out = _templated_context_parallel_attention(
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
None,
|
||||
0.0,
|
||||
is_causal,
|
||||
scale,
|
||||
False,
|
||||
return_lse,
|
||||
forward_op=_sage_attention_hub_forward_op,
|
||||
backward_op=_sage_attention_backward_op,
|
||||
_parallel_config=_parallel_config,
|
||||
)
|
||||
if return_lse:
|
||||
out, lse = out
|
||||
|
||||
return (out, lse) if return_lse else out
|
||||
|
||||
|
||||
@@ -30,10 +30,126 @@ class AutoModel(ConfigMixin):
|
||||
def __init__(self, *args, **kwargs):
|
||||
raise EnvironmentError(
|
||||
f"{self.__class__.__name__} is designed to be instantiated "
|
||||
f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or "
|
||||
f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)`, "
|
||||
f"`{self.__class__.__name__}.from_config(config)`, or "
|
||||
f"`{self.__class__.__name__}.from_pipe(pipeline)` methods."
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, pretrained_model_name_or_path_or_dict: str | os.PathLike | dict | None = None, **kwargs):
|
||||
r"""
|
||||
Instantiate a model from a config dictionary or a pretrained model configuration file with random weights (no
|
||||
pretrained weights are loaded).
|
||||
|
||||
Parameters:
|
||||
pretrained_model_name_or_path_or_dict (`str`, `os.PathLike`, or `dict`):
|
||||
Can be either:
|
||||
|
||||
- A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model
|
||||
configuration hosted on the Hub.
|
||||
- A path to a *directory* (for example `./my_model_directory`) containing a model configuration
|
||||
file.
|
||||
- A config dictionary.
|
||||
|
||||
cache_dir (`Union[str, os.PathLike]`, *optional*):
|
||||
Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
|
||||
is not used.
|
||||
force_download (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to force the (re-)download of the model configuration, overriding the cached version if
|
||||
it exists.
|
||||
proxies (`Dict[str, str]`, *optional*):
|
||||
A dictionary of proxy servers to use by protocol or endpoint.
|
||||
local_files_only(`bool`, *optional*, defaults to `False`):
|
||||
Whether to only load local model configuration files or not.
|
||||
token (`str` or *bool*, *optional*):
|
||||
The token to use as HTTP bearer authorization for remote files.
|
||||
revision (`str`, *optional*, defaults to `"main"`):
|
||||
The specific model version to use.
|
||||
trust_remote_code (`bool`, *optional*, defaults to `False`):
|
||||
Whether to trust remote code.
|
||||
subfolder (`str`, *optional*, defaults to `""`):
|
||||
The subfolder location of a model file within a larger model repository on the Hub or locally.
|
||||
|
||||
Returns:
|
||||
A model object instantiated from the config with random weights.
|
||||
|
||||
Example:
|
||||
|
||||
```py
|
||||
from diffusers import AutoModel
|
||||
|
||||
model = AutoModel.from_config("stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="unet")
|
||||
```
|
||||
"""
|
||||
subfolder = kwargs.pop("subfolder", None)
|
||||
trust_remote_code = kwargs.pop("trust_remote_code", False)
|
||||
|
||||
hub_kwargs_names = [
|
||||
"cache_dir",
|
||||
"force_download",
|
||||
"local_files_only",
|
||||
"proxies",
|
||||
"revision",
|
||||
"token",
|
||||
]
|
||||
hub_kwargs = {name: kwargs.pop(name, None) for name in hub_kwargs_names}
|
||||
|
||||
if pretrained_model_name_or_path_or_dict is None:
|
||||
raise ValueError(
|
||||
"Please provide a `pretrained_model_name_or_path_or_dict` as the first positional argument."
|
||||
)
|
||||
|
||||
if isinstance(pretrained_model_name_or_path_or_dict, (str, os.PathLike)):
|
||||
pretrained_model_name_or_path = pretrained_model_name_or_path_or_dict
|
||||
config = cls.load_config(pretrained_model_name_or_path, subfolder=subfolder, **hub_kwargs)
|
||||
else:
|
||||
config = pretrained_model_name_or_path_or_dict
|
||||
pretrained_model_name_or_path = config.get("_name_or_path", None)
|
||||
|
||||
has_remote_code = "auto_map" in config and cls.__name__ in config["auto_map"]
|
||||
trust_remote_code = resolve_trust_remote_code(
|
||||
trust_remote_code, pretrained_model_name_or_path, has_remote_code
|
||||
)
|
||||
|
||||
if has_remote_code and trust_remote_code:
|
||||
class_ref = config["auto_map"][cls.__name__]
|
||||
module_file, class_name = class_ref.split(".")
|
||||
module_file = module_file + ".py"
|
||||
model_cls = get_class_from_dynamic_module(
|
||||
pretrained_model_name_or_path,
|
||||
subfolder=subfolder,
|
||||
module_file=module_file,
|
||||
class_name=class_name,
|
||||
**hub_kwargs,
|
||||
)
|
||||
else:
|
||||
if "_class_name" in config:
|
||||
class_name = config["_class_name"]
|
||||
library = "diffusers"
|
||||
elif "model_type" in config:
|
||||
class_name = "AutoModel"
|
||||
library = "transformers"
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Couldn't find a model class associated with the config: {config}. Make sure the config "
|
||||
"contains a `_class_name` or `model_type` key."
|
||||
)
|
||||
|
||||
from ..pipelines.pipeline_loading_utils import ALL_IMPORTABLE_CLASSES, get_class_obj_and_candidates
|
||||
|
||||
model_cls, _ = get_class_obj_and_candidates(
|
||||
library_name=library,
|
||||
class_name=class_name,
|
||||
importable_classes=ALL_IMPORTABLE_CLASSES,
|
||||
pipelines=None,
|
||||
is_pipeline_module=False,
|
||||
)
|
||||
|
||||
if model_cls is None:
|
||||
raise ValueError(f"AutoModel can't find a model linked to {class_name}.")
|
||||
|
||||
return model_cls.from_config(config, **kwargs)
|
||||
|
||||
@classmethod
|
||||
@validate_hf_hub_args
|
||||
def from_pretrained(cls, pretrained_model_or_path: str | os.PathLike | None = None, **kwargs):
|
||||
|
||||
@@ -424,7 +424,7 @@ class Flux2SingleTransformerBlock(nn.Module):
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
encoder_hidden_states: torch.Tensor | None,
|
||||
temb_mod_params: tuple[torch.Tensor, torch.Tensor, torch.Tensor],
|
||||
temb_mod: torch.Tensor,
|
||||
image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
|
||||
joint_attention_kwargs: dict[str, Any] | None = None,
|
||||
split_hidden_states: bool = False,
|
||||
@@ -436,7 +436,7 @@ class Flux2SingleTransformerBlock(nn.Module):
|
||||
text_seq_len = encoder_hidden_states.shape[1]
|
||||
hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
|
||||
|
||||
mod_shift, mod_scale, mod_gate = temb_mod_params
|
||||
mod_shift, mod_scale, mod_gate = Flux2Modulation.split(temb_mod, 1)[0]
|
||||
|
||||
norm_hidden_states = self.norm(hidden_states)
|
||||
norm_hidden_states = (1 + mod_scale) * norm_hidden_states + mod_shift
|
||||
@@ -498,16 +498,18 @@ class Flux2TransformerBlock(nn.Module):
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
encoder_hidden_states: torch.Tensor,
|
||||
temb_mod_params_img: tuple[tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...],
|
||||
temb_mod_params_txt: tuple[tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...],
|
||||
temb_mod_img: torch.Tensor,
|
||||
temb_mod_txt: torch.Tensor,
|
||||
image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
|
||||
joint_attention_kwargs: dict[str, Any] | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
joint_attention_kwargs = joint_attention_kwargs or {}
|
||||
|
||||
# Modulation parameters shape: [1, 1, self.dim]
|
||||
(shift_msa, scale_msa, gate_msa), (shift_mlp, scale_mlp, gate_mlp) = temb_mod_params_img
|
||||
(c_shift_msa, c_scale_msa, c_gate_msa), (c_shift_mlp, c_scale_mlp, c_gate_mlp) = temb_mod_params_txt
|
||||
(shift_msa, scale_msa, gate_msa), (shift_mlp, scale_mlp, gate_mlp) = Flux2Modulation.split(temb_mod_img, 2)
|
||||
(c_shift_msa, c_scale_msa, c_gate_msa), (c_shift_mlp, c_scale_mlp, c_gate_mlp) = Flux2Modulation.split(
|
||||
temb_mod_txt, 2
|
||||
)
|
||||
|
||||
# Img stream
|
||||
norm_hidden_states = self.norm1(hidden_states)
|
||||
@@ -627,15 +629,19 @@ class Flux2Modulation(nn.Module):
|
||||
self.linear = nn.Linear(dim, dim * 3 * self.mod_param_sets, bias=bias)
|
||||
self.act_fn = nn.SiLU()
|
||||
|
||||
def forward(self, temb: torch.Tensor) -> tuple[tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...]:
|
||||
def forward(self, temb: torch.Tensor) -> torch.Tensor:
|
||||
mod = self.act_fn(temb)
|
||||
mod = self.linear(mod)
|
||||
return mod
|
||||
|
||||
@staticmethod
|
||||
# split inside the transformer blocks, to avoid passing tuples into checkpoints https://github.com/huggingface/diffusers/issues/12776
|
||||
def split(mod: torch.Tensor, mod_param_sets: int) -> tuple[tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...]:
|
||||
if mod.ndim == 2:
|
||||
mod = mod.unsqueeze(1)
|
||||
mod_params = torch.chunk(mod, 3 * self.mod_param_sets, dim=-1)
|
||||
mod_params = torch.chunk(mod, 3 * mod_param_sets, dim=-1)
|
||||
# Return tuple of 3-tuples of modulation params shift/scale/gate
|
||||
return tuple(mod_params[3 * i : 3 * (i + 1)] for i in range(self.mod_param_sets))
|
||||
return tuple(mod_params[3 * i : 3 * (i + 1)] for i in range(mod_param_sets))
|
||||
|
||||
|
||||
class Flux2Transformer2DModel(
|
||||
@@ -824,7 +830,7 @@ class Flux2Transformer2DModel(
|
||||
|
||||
double_stream_mod_img = self.double_stream_modulation_img(temb)
|
||||
double_stream_mod_txt = self.double_stream_modulation_txt(temb)
|
||||
single_stream_mod = self.single_stream_modulation(temb)[0]
|
||||
single_stream_mod = self.single_stream_modulation(temb)
|
||||
|
||||
# 2. Input projection for image (hidden_states) and conditioning text (encoder_hidden_states)
|
||||
hidden_states = self.x_embedder(hidden_states)
|
||||
@@ -861,8 +867,8 @@ class Flux2Transformer2DModel(
|
||||
encoder_hidden_states, hidden_states = block(
|
||||
hidden_states=hidden_states,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
temb_mod_params_img=double_stream_mod_img,
|
||||
temb_mod_params_txt=double_stream_mod_txt,
|
||||
temb_mod_img=double_stream_mod_img,
|
||||
temb_mod_txt=double_stream_mod_txt,
|
||||
image_rotary_emb=concat_rotary_emb,
|
||||
joint_attention_kwargs=joint_attention_kwargs,
|
||||
)
|
||||
@@ -884,7 +890,7 @@ class Flux2Transformer2DModel(
|
||||
hidden_states = block(
|
||||
hidden_states=hidden_states,
|
||||
encoder_hidden_states=None,
|
||||
temb_mod_params=single_stream_mod,
|
||||
temb_mod=single_stream_mod,
|
||||
image_rotary_emb=concat_rotary_emb,
|
||||
joint_attention_kwargs=joint_attention_kwargs,
|
||||
)
|
||||
|
||||
@@ -164,7 +164,11 @@ def compute_text_seq_len_from_mask(
|
||||
position_ids = torch.arange(text_seq_len, device=encoder_hidden_states.device, dtype=torch.long)
|
||||
active_positions = torch.where(encoder_hidden_states_mask, position_ids, position_ids.new_zeros(()))
|
||||
has_active = encoder_hidden_states_mask.any(dim=1)
|
||||
per_sample_len = torch.where(has_active, active_positions.max(dim=1).values + 1, torch.as_tensor(text_seq_len))
|
||||
per_sample_len = torch.where(
|
||||
has_active,
|
||||
active_positions.max(dim=1).values + 1,
|
||||
torch.as_tensor(text_seq_len, device=encoder_hidden_states.device),
|
||||
)
|
||||
return text_seq_len, per_sample_len, encoder_hidden_states_mask
|
||||
|
||||
|
||||
|
||||
@@ -502,6 +502,10 @@ class AudioLDM2Pipeline(DiffusionPipeline):
|
||||
text_input_ids,
|
||||
attention_mask=attention_mask,
|
||||
)
|
||||
# Extract the pooler output if it's a BaseModelOutputWithPooling (Transformers v5+)
|
||||
# otherwise use it directly (Transformers v4)
|
||||
if hasattr(prompt_embeds, "pooler_output"):
|
||||
prompt_embeds = prompt_embeds.pooler_output
|
||||
# append the seq-len dim: (bs, hidden_size) -> (bs, seq_len, hidden_size)
|
||||
prompt_embeds = prompt_embeds[:, None, :]
|
||||
# make sure that we attend to this single hidden-state
|
||||
@@ -610,6 +614,10 @@ class AudioLDM2Pipeline(DiffusionPipeline):
|
||||
uncond_input_ids,
|
||||
attention_mask=negative_attention_mask,
|
||||
)
|
||||
# Extract the pooler output if it's a BaseModelOutputWithPooling (Transformers v5+)
|
||||
# otherwise use it directly (Transformers v4)
|
||||
if hasattr(negative_prompt_embeds, "pooler_output"):
|
||||
negative_prompt_embeds = negative_prompt_embeds.pooler_output
|
||||
# append the seq-len dim: (bs, hidden_size) -> (bs, seq_len, hidden_size)
|
||||
negative_prompt_embeds = negative_prompt_embeds[:, None, :]
|
||||
# make sure that we attend to this single hidden-state
|
||||
|
||||
@@ -287,6 +287,9 @@ class Cosmos2_5_PredictBasePipeline(DiffusionPipeline):
|
||||
truncation=True,
|
||||
padding="max_length",
|
||||
)
|
||||
input_ids = (
|
||||
input_ids["input_ids"] if not isinstance(input_ids, list) and "input_ids" in input_ids else input_ids
|
||||
)
|
||||
input_ids = torch.LongTensor(input_ids)
|
||||
input_ids_batch.append(input_ids)
|
||||
|
||||
|
||||
@@ -262,6 +262,9 @@ class Cosmos2_5_TransferPipeline(DiffusionPipeline):
|
||||
truncation=True,
|
||||
padding="max_length",
|
||||
)
|
||||
input_ids = (
|
||||
input_ids["input_ids"] if not isinstance(input_ids, list) and "input_ids" in input_ids else input_ids
|
||||
)
|
||||
input_ids = torch.LongTensor(input_ids)
|
||||
input_ids_batch.append(input_ids)
|
||||
|
||||
|
||||
@@ -20,6 +20,8 @@ class MultilingualCLIP(PreTrainedModel):
|
||||
self.LinearTransformation = torch.nn.Linear(
|
||||
in_features=config.transformerDimensions, out_features=config.numDims
|
||||
)
|
||||
if hasattr(self, "post_init"):
|
||||
self.post_init()
|
||||
|
||||
def forward(self, input_ids, attention_mask):
|
||||
embs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)[0]
|
||||
|
||||
@@ -781,6 +781,9 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
|
||||
self.prefix_encoder = PrefixEncoder(config)
|
||||
self.dropout = torch.nn.Dropout(0.1)
|
||||
|
||||
if hasattr(self, "post_init"):
|
||||
self.post_init()
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.embedding.word_embeddings
|
||||
|
||||
@@ -810,7 +813,7 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
||||
use_cache = use_cache if use_cache is not None else getattr(self.config, "use_cache", None)
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
batch_size, seq_length = input_ids.shape
|
||||
|
||||
@@ -341,6 +341,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
||||
save_method_accept_safe = "safe_serialization" in save_method_signature.parameters
|
||||
save_method_accept_variant = "variant" in save_method_signature.parameters
|
||||
save_method_accept_max_shard_size = "max_shard_size" in save_method_signature.parameters
|
||||
save_method_accept_peft_format = "save_peft_format" in save_method_signature.parameters
|
||||
|
||||
save_kwargs = {}
|
||||
if save_method_accept_safe:
|
||||
@@ -350,6 +351,11 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
||||
if save_method_accept_max_shard_size and max_shard_size is not None:
|
||||
# max_shard_size is expected to not be None in ModelMixin
|
||||
save_kwargs["max_shard_size"] = max_shard_size
|
||||
if save_method_accept_peft_format:
|
||||
# Set save_peft_format=False for transformers>=5.0.0 compatibility
|
||||
# In transformers 5.0.0+, the default save_peft_format=True adds "base_model.model" prefix
|
||||
# to adapter keys, but from_pretrained expects keys without this prefix
|
||||
save_kwargs["save_peft_format"] = False
|
||||
|
||||
save_method(os.path.join(save_directory, pipeline_component_name), **save_kwargs)
|
||||
|
||||
|
||||
@@ -24,14 +24,25 @@ except OptionalDependencyNotAvailable:
|
||||
else:
|
||||
_import_structure["pipeline_prx"] = ["PRXPipeline"]
|
||||
|
||||
# Import T5GemmaEncoder for pipeline loading compatibility
|
||||
# Wrap T5GemmaEncoder to pass config.encoder (T5GemmaModuleConfig) instead of the
|
||||
# composite T5GemmaConfig, which lacks flat attributes expected by T5GemmaEncoder.__init__.
|
||||
try:
|
||||
if is_transformers_available():
|
||||
import transformers
|
||||
from transformers.models.t5gemma.modeling_t5gemma import T5GemmaEncoder
|
||||
from transformers.models.t5gemma.modeling_t5gemma import T5GemmaEncoder as _T5GemmaEncoder
|
||||
|
||||
class T5GemmaEncoder(_T5GemmaEncoder):
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
|
||||
if "config" not in kwargs:
|
||||
from transformers.models.t5gemma.configuration_t5gemma import T5GemmaConfig
|
||||
|
||||
config = T5GemmaConfig.from_pretrained(pretrained_model_name_or_path)
|
||||
if hasattr(config, "encoder"):
|
||||
kwargs["config"] = config.encoder
|
||||
return super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
|
||||
|
||||
_additional_imports["T5GemmaEncoder"] = T5GemmaEncoder
|
||||
# Patch transformers module directly for serialization
|
||||
if not hasattr(transformers, "T5GemmaEncoder"):
|
||||
transformers.T5GemmaEncoder = T5GemmaEncoder
|
||||
except ImportError:
|
||||
|
||||
@@ -18,7 +18,6 @@ import re
|
||||
import urllib.parse as ul
|
||||
from typing import Callable
|
||||
|
||||
import ftfy
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
@@ -34,13 +33,13 @@ from diffusers.models.transformers.transformer_prx import PRXTransformer2DModel
|
||||
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
|
||||
from diffusers.pipelines.prx.pipeline_output import PRXPipelineOutput
|
||||
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
|
||||
from diffusers.utils import (
|
||||
logging,
|
||||
replace_example_docstring,
|
||||
)
|
||||
from diffusers.utils import is_ftfy_available, logging, replace_example_docstring
|
||||
from diffusers.utils.torch_utils import randn_tensor
|
||||
|
||||
|
||||
if is_ftfy_available():
|
||||
import ftfy
|
||||
|
||||
DEFAULT_RESOLUTION = 512
|
||||
|
||||
ASPECT_RATIO_256_BIN = {
|
||||
|
||||
@@ -17,7 +17,7 @@ from typing import Any, Callable
|
||||
|
||||
import regex as re
|
||||
import torch
|
||||
from transformers import AutoTokenizer, UMT5EncoderModel
|
||||
from transformers import AutoTokenizer, T5EncoderModel, UMT5EncoderModel
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...loaders import SkyReelsV2LoraLoaderMixin
|
||||
@@ -132,7 +132,7 @@ class SkyReelsV2Pipeline(DiffusionPipeline, SkyReelsV2LoraLoaderMixin):
|
||||
def __init__(
|
||||
self,
|
||||
tokenizer: AutoTokenizer,
|
||||
text_encoder: UMT5EncoderModel,
|
||||
text_encoder: T5EncoderModel | UMT5EncoderModel,
|
||||
transformer: SkyReelsV2Transformer3DModel,
|
||||
vae: AutoencoderKLWan,
|
||||
scheduler: UniPCMultistepScheduler,
|
||||
|
||||
@@ -19,7 +19,7 @@ from copy import deepcopy
|
||||
from typing import Any, Callable
|
||||
|
||||
import torch
|
||||
from transformers import AutoTokenizer, UMT5EncoderModel
|
||||
from transformers import AutoTokenizer, T5EncoderModel, UMT5EncoderModel
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...loaders import SkyReelsV2LoraLoaderMixin
|
||||
@@ -153,7 +153,7 @@ class SkyReelsV2DiffusionForcingPipeline(DiffusionPipeline, SkyReelsV2LoraLoader
|
||||
def __init__(
|
||||
self,
|
||||
tokenizer: AutoTokenizer,
|
||||
text_encoder: UMT5EncoderModel,
|
||||
text_encoder: T5EncoderModel | UMT5EncoderModel,
|
||||
transformer: SkyReelsV2Transformer3DModel,
|
||||
vae: AutoencoderKLWan,
|
||||
scheduler: UniPCMultistepScheduler,
|
||||
|
||||
@@ -20,7 +20,7 @@ from typing import Any, Callable
|
||||
|
||||
import PIL
|
||||
import torch
|
||||
from transformers import AutoTokenizer, UMT5EncoderModel
|
||||
from transformers import AutoTokenizer, T5EncoderModel, UMT5EncoderModel
|
||||
|
||||
from diffusers.image_processor import PipelineImageInput
|
||||
from diffusers.utils.torch_utils import randn_tensor
|
||||
@@ -158,7 +158,7 @@ class SkyReelsV2DiffusionForcingImageToVideoPipeline(DiffusionPipeline, SkyReels
|
||||
def __init__(
|
||||
self,
|
||||
tokenizer: AutoTokenizer,
|
||||
text_encoder: UMT5EncoderModel,
|
||||
text_encoder: T5EncoderModel | UMT5EncoderModel,
|
||||
transformer: SkyReelsV2Transformer3DModel,
|
||||
vae: AutoencoderKLWan,
|
||||
scheduler: UniPCMultistepScheduler,
|
||||
|
||||
@@ -21,7 +21,7 @@ from typing import Any, Callable
|
||||
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import AutoTokenizer, UMT5EncoderModel
|
||||
from transformers import AutoTokenizer, T5EncoderModel, UMT5EncoderModel
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...loaders import SkyReelsV2LoraLoaderMixin
|
||||
@@ -214,7 +214,7 @@ class SkyReelsV2DiffusionForcingVideoToVideoPipeline(DiffusionPipeline, SkyReels
|
||||
def __init__(
|
||||
self,
|
||||
tokenizer: AutoTokenizer,
|
||||
text_encoder: UMT5EncoderModel,
|
||||
text_encoder: T5EncoderModel | UMT5EncoderModel,
|
||||
transformer: SkyReelsV2Transformer3DModel,
|
||||
vae: AutoencoderKLWan,
|
||||
scheduler: UniPCMultistepScheduler,
|
||||
|
||||
@@ -18,7 +18,7 @@ from typing import Any, Callable
|
||||
import PIL
|
||||
import regex as re
|
||||
import torch
|
||||
from transformers import AutoTokenizer, CLIPProcessor, CLIPVisionModelWithProjection, UMT5EncoderModel
|
||||
from transformers import AutoTokenizer, CLIPProcessor, CLIPVisionModelWithProjection, T5EncoderModel, UMT5EncoderModel
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...image_processor import PipelineImageInput
|
||||
@@ -157,7 +157,7 @@ class SkyReelsV2ImageToVideoPipeline(DiffusionPipeline, SkyReelsV2LoraLoaderMixi
|
||||
def __init__(
|
||||
self,
|
||||
tokenizer: AutoTokenizer,
|
||||
text_encoder: UMT5EncoderModel,
|
||||
text_encoder: T5EncoderModel | UMT5EncoderModel,
|
||||
image_encoder: CLIPVisionModelWithProjection,
|
||||
image_processor: CLIPProcessor,
|
||||
transformer: SkyReelsV2Transformer3DModel,
|
||||
|
||||
@@ -112,6 +112,8 @@ def _load_transformers_model_from_dduf(
|
||||
tensors = safetensors.torch.load(mmap)
|
||||
# Update the state dictionary with tensors
|
||||
state_dict.update(tensors)
|
||||
# `from_pretrained` sets the model to eval mode by default, which is the
|
||||
# correct behavior for inference. Do not call `model.train()` here.
|
||||
return cls.from_pretrained(
|
||||
pretrained_model_name_or_path=None,
|
||||
config=config,
|
||||
|
||||
@@ -516,6 +516,9 @@ def dequantize_gguf_tensor(tensor):
|
||||
|
||||
block_size, type_size = GGML_QUANT_SIZES[quant_type]
|
||||
|
||||
# Conver to plain tensor to avoid unnecessary __torch_function__ overhead.
|
||||
tensor = tensor.as_tensor()
|
||||
|
||||
tensor = tensor.view(torch.uint8)
|
||||
shape = _quant_shape_from_byte_shape(tensor.shape, type_size, block_size)
|
||||
|
||||
@@ -525,7 +528,7 @@ def dequantize_gguf_tensor(tensor):
|
||||
dequant = dequant_fn(blocks, block_size, type_size)
|
||||
dequant = dequant.reshape(shape)
|
||||
|
||||
return dequant.as_tensor()
|
||||
return dequant
|
||||
|
||||
|
||||
class GGUFParameter(torch.nn.Parameter):
|
||||
|
||||
@@ -36,7 +36,7 @@ from typing import Any, Callable
|
||||
|
||||
from packaging import version
|
||||
|
||||
from ..utils import is_torch_available, is_torchao_available, is_torchao_version, logging
|
||||
from ..utils import deprecate, is_torch_available, is_torchao_available, is_torchao_version, logging
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
@@ -844,6 +844,8 @@ class QuantoConfig(QuantizationConfigMixin):
|
||||
modules_to_not_convert: list[str] | None = None,
|
||||
**kwargs,
|
||||
):
|
||||
deprecation_message = "`QuantoConfig` is deprecated and will be removed in version 1.0.0."
|
||||
deprecate("QuantoConfig", "1.0.0", deprecation_message)
|
||||
self.quant_method = QuantizationMethod.QUANTO
|
||||
self.weights_dtype = weights_dtype
|
||||
self.modules_to_not_convert = modules_to_not_convert
|
||||
|
||||
@@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Any
|
||||
from diffusers.utils.import_utils import is_optimum_quanto_version
|
||||
|
||||
from ...utils import (
|
||||
deprecate,
|
||||
get_module_from_name,
|
||||
is_accelerate_available,
|
||||
is_accelerate_version,
|
||||
@@ -42,6 +43,9 @@ class QuantoQuantizer(DiffusersQuantizer):
|
||||
super().__init__(quantization_config, **kwargs)
|
||||
|
||||
def validate_environment(self, *args, **kwargs):
|
||||
deprecation_message = "The Quanto quantizer is deprecated and will be removed in version 1.0.0."
|
||||
deprecate("QuantoQuantizer", "1.0.0", deprecation_message)
|
||||
|
||||
if not is_optimum_quanto_available():
|
||||
raise ImportError(
|
||||
"Loading an optimum-quanto quantized model requires optimum-quanto library (`pip install optimum-quanto`)"
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -41,7 +42,7 @@ class FlowMatchLCMSchedulerOutput(BaseOutput):
|
||||
denoising loop.
|
||||
"""
|
||||
|
||||
prev_sample: torch.FloatTensor
|
||||
prev_sample: torch.Tensor
|
||||
|
||||
|
||||
class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
@@ -79,11 +80,11 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
use_beta_sigmas (`bool`, defaults to False):
|
||||
Whether to use beta sigmas for step sizes in the noise schedule during sampling.
|
||||
time_shift_type (`str`, defaults to "exponential"):
|
||||
The type of dynamic resolution-dependent timestep shifting to apply. Either "exponential" or "linear".
|
||||
scale_factors ('list', defaults to None)
|
||||
The type of dynamic resolution-dependent timestep shifting to apply.
|
||||
scale_factors (`list[float]`, *optional*, defaults to `None`):
|
||||
It defines how to scale the latents at which predictions are made.
|
||||
upscale_mode ('str', defaults to 'bicubic')
|
||||
Upscaling method, applied if scale-wise generation is considered
|
||||
upscale_mode (`str`, *optional*, defaults to "bicubic"):
|
||||
Upscaling method, applied if scale-wise generation is considered.
|
||||
"""
|
||||
|
||||
_compatibles = []
|
||||
@@ -101,16 +102,33 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
max_image_seq_len: int = 4096,
|
||||
invert_sigmas: bool = False,
|
||||
shift_terminal: float | None = None,
|
||||
use_karras_sigmas: bool = False,
|
||||
use_exponential_sigmas: bool = False,
|
||||
use_beta_sigmas: bool = False,
|
||||
time_shift_type: str = "exponential",
|
||||
use_karras_sigmas: bool | None = False,
|
||||
use_exponential_sigmas: bool | None = False,
|
||||
use_beta_sigmas: bool | None = False,
|
||||
time_shift_type: Literal["exponential", "linear"] = "exponential",
|
||||
scale_factors: list[float] | None = None,
|
||||
upscale_mode: str = "bicubic",
|
||||
upscale_mode: Literal[
|
||||
"nearest",
|
||||
"linear",
|
||||
"bilinear",
|
||||
"bicubic",
|
||||
"trilinear",
|
||||
"area",
|
||||
"nearest-exact",
|
||||
] = "bicubic",
|
||||
):
|
||||
if self.config.use_beta_sigmas and not is_scipy_available():
|
||||
raise ImportError("Make sure to install scipy if you want to use beta sigmas.")
|
||||
if sum([self.config.use_beta_sigmas, self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
|
||||
if (
|
||||
sum(
|
||||
[
|
||||
self.config.use_beta_sigmas,
|
||||
self.config.use_exponential_sigmas,
|
||||
self.config.use_karras_sigmas,
|
||||
]
|
||||
)
|
||||
> 1
|
||||
):
|
||||
raise ValueError(
|
||||
"Only one of `config.use_beta_sigmas`, `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used."
|
||||
)
|
||||
@@ -162,7 +180,7 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
return self._begin_index
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
|
||||
def set_begin_index(self, begin_index: int = 0):
|
||||
def set_begin_index(self, begin_index: int = 0) -> None:
|
||||
"""
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
@@ -172,18 +190,18 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
|
||||
def set_shift(self, shift: float):
|
||||
def set_shift(self, shift: float) -> None:
|
||||
self._shift = shift
|
||||
|
||||
def set_scale_factors(self, scale_factors: list, upscale_mode):
|
||||
def set_scale_factors(self, scale_factors: list[float], upscale_mode: str) -> None:
|
||||
"""
|
||||
Sets scale factors for a scale-wise generation regime.
|
||||
|
||||
Args:
|
||||
scale_factors (`list`):
|
||||
The scale factors for each step
|
||||
scale_factors (`list[float]`):
|
||||
The scale factors for each step.
|
||||
upscale_mode (`str`):
|
||||
Upscaling method
|
||||
Upscaling method.
|
||||
"""
|
||||
self._scale_factors = scale_factors
|
||||
self._upscale_mode = upscale_mode
|
||||
@@ -238,16 +256,18 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
return sample
|
||||
|
||||
def _sigma_to_t(self, sigma):
|
||||
def _sigma_to_t(self, sigma: float | torch.FloatTensor) -> float | torch.FloatTensor:
|
||||
return sigma * self.config.num_train_timesteps
|
||||
|
||||
def time_shift(self, mu: float, sigma: float, t: torch.Tensor):
|
||||
def time_shift(
|
||||
self, mu: float, sigma: float, t: float | np.ndarray | torch.Tensor
|
||||
) -> float | np.ndarray | torch.Tensor:
|
||||
if self.config.time_shift_type == "exponential":
|
||||
return self._time_shift_exponential(mu, sigma, t)
|
||||
elif self.config.time_shift_type == "linear":
|
||||
return self._time_shift_linear(mu, sigma, t)
|
||||
|
||||
def stretch_shift_to_terminal(self, t: torch.Tensor) -> torch.Tensor:
|
||||
def stretch_shift_to_terminal(self, t: np.ndarray | torch.Tensor) -> np.ndarray | torch.Tensor:
|
||||
r"""
|
||||
Stretches and shifts the timestep schedule to ensure it terminates at the configured `shift_terminal` config
|
||||
value.
|
||||
@@ -256,12 +276,13 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
https://github.com/Lightricks/LTX-Video/blob/a01a171f8fe3d99dce2728d60a73fecf4d4238ae/ltx_video/schedulers/rf.py#L51
|
||||
|
||||
Args:
|
||||
t (`torch.Tensor`):
|
||||
A tensor of timesteps to be stretched and shifted.
|
||||
t (`torch.Tensor` or `np.ndarray`):
|
||||
A tensor or numpy array of timesteps to be stretched and shifted.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
A tensor of adjusted timesteps such that the final value equals `self.config.shift_terminal`.
|
||||
`torch.Tensor` or `np.ndarray`:
|
||||
A tensor or numpy array of adjusted timesteps such that the final value equals
|
||||
`self.config.shift_terminal`.
|
||||
"""
|
||||
one_minus_z = 1 - t
|
||||
scale_factor = one_minus_z[-1] / (1 - self.config.shift_terminal)
|
||||
@@ -270,12 +291,12 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
def set_timesteps(
|
||||
self,
|
||||
num_inference_steps: int = None,
|
||||
device: str | torch.device = None,
|
||||
num_inference_steps: int | None = None,
|
||||
device: str | torch.device | None = None,
|
||||
sigmas: list[float] | None = None,
|
||||
mu: float = None,
|
||||
mu: float | None = None,
|
||||
timesteps: list[float] | None = None,
|
||||
):
|
||||
) -> None:
|
||||
"""
|
||||
Sets the discrete timesteps used for the diffusion chain (to be run before inference).
|
||||
|
||||
@@ -317,43 +338,45 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
is_timesteps_provided = timesteps is not None
|
||||
|
||||
if is_timesteps_provided:
|
||||
timesteps = np.array(timesteps).astype(np.float32)
|
||||
timesteps = np.array(timesteps).astype(np.float32) # type: ignore
|
||||
|
||||
if sigmas is None:
|
||||
if timesteps is None:
|
||||
timesteps = np.linspace(
|
||||
self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps
|
||||
timesteps = np.linspace( # type: ignore
|
||||
self._sigma_to_t(self.sigma_max),
|
||||
self._sigma_to_t(self.sigma_min),
|
||||
num_inference_steps,
|
||||
)
|
||||
sigmas = timesteps / self.config.num_train_timesteps
|
||||
sigmas = timesteps / self.config.num_train_timesteps # type: ignore
|
||||
else:
|
||||
sigmas = np.array(sigmas).astype(np.float32)
|
||||
sigmas = np.array(sigmas).astype(np.float32) # type: ignore
|
||||
num_inference_steps = len(sigmas)
|
||||
|
||||
# 2. Perform timestep shifting. Either no shifting is applied, or resolution-dependent shifting of
|
||||
# "exponential" or "linear" type is applied
|
||||
if self.config.use_dynamic_shifting:
|
||||
sigmas = self.time_shift(mu, 1.0, sigmas)
|
||||
sigmas = self.time_shift(mu, 1.0, sigmas) # type: ignore
|
||||
else:
|
||||
sigmas = self.shift * sigmas / (1 + (self.shift - 1) * sigmas)
|
||||
sigmas = self.shift * sigmas / (1 + (self.shift - 1) * sigmas) # type: ignore
|
||||
|
||||
# 3. If required, stretch the sigmas schedule to terminate at the configured `shift_terminal` value
|
||||
if self.config.shift_terminal:
|
||||
sigmas = self.stretch_shift_to_terminal(sigmas)
|
||||
sigmas = self.stretch_shift_to_terminal(sigmas) # type: ignore
|
||||
|
||||
# 4. If required, convert sigmas to one of karras, exponential, or beta sigma schedules
|
||||
if self.config.use_karras_sigmas:
|
||||
sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
|
||||
sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) # type: ignore
|
||||
elif self.config.use_exponential_sigmas:
|
||||
sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
|
||||
sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=num_inference_steps) # type: ignore
|
||||
elif self.config.use_beta_sigmas:
|
||||
sigmas = self._convert_to_beta(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
|
||||
sigmas = self._convert_to_beta(in_sigmas=sigmas, num_inference_steps=num_inference_steps) # type: ignore
|
||||
|
||||
# 5. Convert sigmas and timesteps to tensors and move to specified device
|
||||
sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)
|
||||
sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) # type: ignore
|
||||
if not is_timesteps_provided:
|
||||
timesteps = sigmas * self.config.num_train_timesteps
|
||||
timesteps = sigmas * self.config.num_train_timesteps # type: ignore
|
||||
else:
|
||||
timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32, device=device)
|
||||
timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32, device=device) # type: ignore
|
||||
|
||||
# 6. Append the terminal sigma value.
|
||||
# If a model requires inverted sigma schedule for denoising but timesteps without inversion, the
|
||||
@@ -370,7 +393,11 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
self._step_index = None
|
||||
self._begin_index = None
|
||||
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
def index_for_timestep(
|
||||
self,
|
||||
timestep: float | torch.Tensor,
|
||||
schedule_timesteps: torch.Tensor | None = None,
|
||||
) -> int:
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -382,9 +409,9 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
# case we start in the middle of the denoising schedule (e.g. for image-to-image)
|
||||
pos = 1 if len(indices) > 1 else 0
|
||||
|
||||
return indices[pos].item()
|
||||
return int(indices[pos].item())
|
||||
|
||||
def _init_step_index(self, timestep):
|
||||
def _init_step_index(self, timestep: float | torch.Tensor) -> None:
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -459,7 +486,12 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
size = [round(self._scale_factors[self._step_index] * size) for size in self._init_size]
|
||||
x0_pred = torch.nn.functional.interpolate(x0_pred, size=size, mode=self._upscale_mode)
|
||||
|
||||
noise = randn_tensor(x0_pred.shape, generator=generator, device=x0_pred.device, dtype=x0_pred.dtype)
|
||||
noise = randn_tensor(
|
||||
x0_pred.shape,
|
||||
generator=generator,
|
||||
device=x0_pred.device,
|
||||
dtype=x0_pred.dtype,
|
||||
)
|
||||
prev_sample = (1 - sigma_next) * x0_pred + sigma_next * noise
|
||||
|
||||
# upon completion increase step index by one
|
||||
@@ -473,7 +505,7 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
return FlowMatchLCMSchedulerOutput(prev_sample=prev_sample)
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
@@ -594,11 +626,15 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
)
|
||||
return sigmas
|
||||
|
||||
def _time_shift_exponential(self, mu, sigma, t):
|
||||
def _time_shift_exponential(
|
||||
self, mu: float, sigma: float, t: float | np.ndarray | torch.Tensor
|
||||
) -> float | np.ndarray | torch.Tensor:
|
||||
return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
|
||||
|
||||
def _time_shift_linear(self, mu, sigma, t):
|
||||
def _time_shift_linear(
|
||||
self, mu: float, sigma: float, t: float | np.ndarray | torch.Tensor
|
||||
) -> float | np.ndarray | torch.Tensor:
|
||||
return mu / (mu + (1 / t - 1) ** sigma)
|
||||
|
||||
def __len__(self):
|
||||
def __len__(self) -> int:
|
||||
return self.config.num_train_timesteps
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import unittest
|
||||
from unittest.mock import patch
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from transformers import CLIPTextModel, LongformerModel
|
||||
|
||||
@@ -20,7 +20,9 @@ class TestAutoModel(unittest.TestCase):
|
||||
side_effect=[EnvironmentError("File not found"), {"model_type": "clip_text_model"}],
|
||||
)
|
||||
def test_load_from_config_transformers_with_subfolder(self, mock_load_config):
|
||||
model = AutoModel.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="text_encoder")
|
||||
model = AutoModel.from_pretrained(
|
||||
"hf-internal-testing/tiny-stable-diffusion-torch", subfolder="text_encoder", use_safetensors=False
|
||||
)
|
||||
assert isinstance(model, CLIPTextModel)
|
||||
|
||||
def test_load_from_config_without_subfolder(self):
|
||||
@@ -28,5 +30,73 @@ class TestAutoModel(unittest.TestCase):
|
||||
assert isinstance(model, LongformerModel)
|
||||
|
||||
def test_load_from_model_index(self):
|
||||
model = AutoModel.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="text_encoder")
|
||||
model = AutoModel.from_pretrained(
|
||||
"hf-internal-testing/tiny-stable-diffusion-torch", subfolder="text_encoder", use_safetensors=False
|
||||
)
|
||||
assert isinstance(model, CLIPTextModel)
|
||||
|
||||
|
||||
class TestAutoModelFromConfig(unittest.TestCase):
|
||||
@patch(
|
||||
"diffusers.pipelines.pipeline_loading_utils.get_class_obj_and_candidates",
|
||||
return_value=(MagicMock(), None),
|
||||
)
|
||||
def test_from_config_with_dict_diffusers_class(self, mock_get_class):
|
||||
config = {"_class_name": "UNet2DConditionModel", "sample_size": 64}
|
||||
mock_model = MagicMock()
|
||||
mock_get_class.return_value[0].from_config.return_value = mock_model
|
||||
|
||||
result = AutoModel.from_config(config)
|
||||
|
||||
mock_get_class.assert_called_once_with(
|
||||
library_name="diffusers",
|
||||
class_name="UNet2DConditionModel",
|
||||
importable_classes=unittest.mock.ANY,
|
||||
pipelines=None,
|
||||
is_pipeline_module=False,
|
||||
)
|
||||
mock_get_class.return_value[0].from_config.assert_called_once_with(config)
|
||||
assert result is mock_model
|
||||
|
||||
@patch(
|
||||
"diffusers.pipelines.pipeline_loading_utils.get_class_obj_and_candidates",
|
||||
return_value=(MagicMock(), None),
|
||||
)
|
||||
@patch("diffusers.models.AutoModel.load_config", return_value={"_class_name": "UNet2DConditionModel"})
|
||||
def test_from_config_with_string_path(self, mock_load_config, mock_get_class):
|
||||
mock_model = MagicMock()
|
||||
mock_get_class.return_value[0].from_config.return_value = mock_model
|
||||
|
||||
result = AutoModel.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet")
|
||||
|
||||
mock_load_config.assert_called_once()
|
||||
assert result is mock_model
|
||||
|
||||
def test_from_config_raises_on_missing_class_info(self):
|
||||
config = {"some_key": "some_value"}
|
||||
with self.assertRaises(ValueError, msg="Couldn't find a model class"):
|
||||
AutoModel.from_config(config)
|
||||
|
||||
@patch(
|
||||
"diffusers.pipelines.pipeline_loading_utils.get_class_obj_and_candidates",
|
||||
return_value=(MagicMock(), None),
|
||||
)
|
||||
def test_from_config_with_model_type_routes_to_transformers(self, mock_get_class):
|
||||
config = {"model_type": "clip_text_model"}
|
||||
mock_model = MagicMock()
|
||||
mock_get_class.return_value[0].from_config.return_value = mock_model
|
||||
|
||||
result = AutoModel.from_config(config)
|
||||
|
||||
mock_get_class.assert_called_once_with(
|
||||
library_name="transformers",
|
||||
class_name="AutoModel",
|
||||
importable_classes=unittest.mock.ANY,
|
||||
pipelines=None,
|
||||
is_pipeline_module=False,
|
||||
)
|
||||
assert result is mock_model
|
||||
|
||||
def test_from_config_raises_on_none(self):
|
||||
with self.assertRaises(ValueError, msg="Please provide a `pretrained_model_name_or_path_or_dict`"):
|
||||
AutoModel.from_config(None)
|
||||
|
||||
@@ -375,7 +375,7 @@ class LoraHotSwappingForModelTesterMixin:
|
||||
# additionally check if dynamic compilation works.
|
||||
if different_shapes is not None:
|
||||
for height, width in different_shapes:
|
||||
new_inputs_dict = self.prepare_dummy_input(height=height, width=width)
|
||||
new_inputs_dict = self.get_dummy_inputs(height=height, width=width)
|
||||
_ = model(**new_inputs_dict)
|
||||
else:
|
||||
output0_after = model(**inputs_dict)["sample"]
|
||||
@@ -390,7 +390,7 @@ class LoraHotSwappingForModelTesterMixin:
|
||||
with torch.inference_mode():
|
||||
if different_shapes is not None:
|
||||
for height, width in different_shapes:
|
||||
new_inputs_dict = self.prepare_dummy_input(height=height, width=width)
|
||||
new_inputs_dict = self.get_dummy_inputs(height=height, width=width)
|
||||
_ = model(**new_inputs_dict)
|
||||
else:
|
||||
output1_after = model(**inputs_dict)["sample"]
|
||||
|
||||
@@ -282,6 +282,8 @@ class AudioLDM2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
text_inputs = text_inputs["input_ids"].to(torch_device)
|
||||
|
||||
clap_prompt_embeds = audioldm_pipe.text_encoder.get_text_features(text_inputs)
|
||||
if hasattr(clap_prompt_embeds, "pooler_output"):
|
||||
clap_prompt_embeds = clap_prompt_embeds.pooler_output
|
||||
clap_prompt_embeds = clap_prompt_embeds[:, None, :]
|
||||
|
||||
text_inputs = audioldm_pipe.tokenizer_2(
|
||||
@@ -341,6 +343,8 @@ class AudioLDM2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
text_inputs = text_inputs["input_ids"].to(torch_device)
|
||||
|
||||
clap_prompt_embeds = audioldm_pipe.text_encoder.get_text_features(text_inputs)
|
||||
if hasattr(clap_prompt_embeds, "pooler_output"):
|
||||
clap_prompt_embeds = clap_prompt_embeds.pooler_output
|
||||
clap_prompt_embeds = clap_prompt_embeds[:, None, :]
|
||||
|
||||
text_inputs = audioldm_pipe.tokenizer_2(
|
||||
|
||||
@@ -19,7 +19,7 @@ import unittest
|
||||
import numpy as np
|
||||
import torch
|
||||
from huggingface_hub import hf_hub_download
|
||||
from transformers import T5EncoderModel, T5TokenizerFast
|
||||
from transformers import AutoConfig, T5EncoderModel, T5TokenizerFast
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -89,7 +89,8 @@ class BriaPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
scheduler = FlowMatchEulerDiscreteScheduler()
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
tokenizer = T5TokenizerFast.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
|
||||
@@ -2,7 +2,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKL, ChromaPipeline, ChromaTransformer2DModel, FlowMatchEulerDiscreteScheduler
|
||||
|
||||
@@ -41,7 +41,8 @@ class ChromaPipelineFastTests(
|
||||
)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKL, ChromaImg2ImgPipeline, ChromaTransformer2DModel, FlowMatchEulerDiscreteScheduler
|
||||
|
||||
@@ -42,7 +42,8 @@ class ChromaImg2ImgPipelineFastTests(
|
||||
)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
|
||||
@@ -17,6 +17,7 @@ import unittest
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
CLIPImageProcessor,
|
||||
CLIPVisionConfig,
|
||||
@@ -71,7 +72,8 @@ class ChronoEditPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
torch.manual_seed(0)
|
||||
# TODO: impl FlowDPMSolverMultistepScheduler
|
||||
scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
torch.manual_seed(0)
|
||||
|
||||
@@ -18,7 +18,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKLCogVideoX, CogVideoXPipeline, CogVideoXTransformer3DModel, DDIMScheduler
|
||||
|
||||
@@ -117,7 +117,8 @@ class CogVideoXPipelineFastTests(
|
||||
|
||||
torch.manual_seed(0)
|
||||
scheduler = DDIMScheduler()
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
@@ -235,6 +236,9 @@ class CogVideoXPipelineFastTests(
|
||||
return
|
||||
|
||||
components = self.get_dummy_components()
|
||||
for key in components:
|
||||
if "text_encoder" in key and hasattr(components[key], "eval"):
|
||||
components[key].eval()
|
||||
pipe = self.pipeline_class(**components)
|
||||
for component in pipe.components.values():
|
||||
if hasattr(component, "set_default_attn_processor"):
|
||||
|
||||
@@ -18,7 +18,7 @@ import unittest
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKLCogVideoX, CogVideoXFunControlPipeline, CogVideoXTransformer3DModel, DDIMScheduler
|
||||
|
||||
@@ -104,7 +104,8 @@ class CogVideoXFunControlPipelineFastTests(PipelineTesterMixin, unittest.TestCas
|
||||
|
||||
torch.manual_seed(0)
|
||||
scheduler = DDIMScheduler()
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
@@ -228,6 +229,9 @@ class CogVideoXFunControlPipelineFastTests(PipelineTesterMixin, unittest.TestCas
|
||||
return
|
||||
|
||||
components = self.get_dummy_components()
|
||||
for key in components:
|
||||
if "text_encoder" in key and hasattr(components[key], "eval"):
|
||||
components[key].eval()
|
||||
pipe = self.pipeline_class(**components)
|
||||
for component in pipe.components.values():
|
||||
if hasattr(component, "set_default_attn_processor"):
|
||||
|
||||
@@ -19,7 +19,7 @@ import unittest
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKLCogVideoX, CogVideoXImageToVideoPipeline, CogVideoXTransformer3DModel, DDIMScheduler
|
||||
from diffusers.utils import load_image
|
||||
@@ -113,7 +113,8 @@ class CogVideoXImageToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestC
|
||||
|
||||
torch.manual_seed(0)
|
||||
scheduler = DDIMScheduler()
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
@@ -237,6 +238,9 @@ class CogVideoXImageToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestC
|
||||
return
|
||||
|
||||
components = self.get_dummy_components()
|
||||
for key in components:
|
||||
if "text_encoder" in key and hasattr(components[key], "eval"):
|
||||
components[key].eval()
|
||||
pipe = self.pipeline_class(**components)
|
||||
for component in pipe.components.values():
|
||||
if hasattr(component, "set_default_attn_processor"):
|
||||
|
||||
@@ -18,7 +18,7 @@ import unittest
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel, CogVideoXVideoToVideoPipeline, DDIMScheduler
|
||||
|
||||
@@ -99,7 +99,8 @@ class CogVideoXVideoToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestC
|
||||
|
||||
torch.manual_seed(0)
|
||||
scheduler = DDIMScheduler()
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
|
||||
@@ -18,7 +18,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKL, CogVideoXDDIMScheduler, CogView3PlusPipeline, CogView3PlusTransformer2DModel
|
||||
|
||||
@@ -89,7 +89,8 @@ class CogView3PlusPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
|
||||
torch.manual_seed(0)
|
||||
scheduler = CogVideoXDDIMScheduler()
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
|
||||
@@ -108,7 +108,7 @@ class CogView4PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
generator = torch.Generator(device=device).manual_seed(seed)
|
||||
inputs = {
|
||||
"prompt": "dance monkey",
|
||||
"negative_prompt": "",
|
||||
"negative_prompt": "bad",
|
||||
"generator": generator,
|
||||
"num_inference_steps": 2,
|
||||
"guidance_scale": 6.0,
|
||||
|
||||
@@ -19,7 +19,7 @@ import unittest
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKLCogVideoX, ConsisIDPipeline, ConsisIDTransformer3DModel, DDIMScheduler
|
||||
from diffusers.utils import load_image
|
||||
@@ -122,7 +122,8 @@ class ConsisIDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
|
||||
torch.manual_seed(0)
|
||||
scheduler = DDIMScheduler()
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
@@ -248,6 +249,9 @@ class ConsisIDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
return
|
||||
|
||||
components = self.get_dummy_components()
|
||||
for key in components:
|
||||
if "text_encoder" in key and hasattr(components[key], "eval"):
|
||||
components[key].eval()
|
||||
pipe = self.pipeline_class(**components)
|
||||
for component in pipe.components.values():
|
||||
if hasattr(component, "set_default_attn_processor"):
|
||||
|
||||
@@ -19,7 +19,7 @@ import unittest
|
||||
import numpy as np
|
||||
import torch
|
||||
from huggingface_hub import hf_hub_download
|
||||
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
|
||||
from transformers import AutoConfig, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -97,7 +97,8 @@ class FluxControlNetPipelineFastTests(unittest.TestCase, PipelineTesterMixin, Fl
|
||||
text_encoder = CLIPTextModel(clip_text_encoder_config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_2 = T5EncoderModel(config)
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = T5TokenizerFast.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
@@ -2,7 +2,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -13,9 +13,7 @@ from diffusers import (
|
||||
)
|
||||
from diffusers.utils.torch_utils import randn_tensor
|
||||
|
||||
from ...testing_utils import (
|
||||
torch_device,
|
||||
)
|
||||
from ...testing_utils import torch_device
|
||||
from ..test_pipelines_common import PipelineTesterMixin, check_qkv_fused_layers_exist
|
||||
|
||||
|
||||
@@ -70,7 +68,8 @@ class FluxControlNetImg2ImgPipelineFastTests(unittest.TestCase, PipelineTesterMi
|
||||
text_encoder = CLIPTextModel(clip_text_encoder_config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_2 = T5EncoderModel(config)
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
@@ -3,15 +3,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
# torch_device, # {{ edit_1 }} Removed unused import
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
CLIPTextConfig,
|
||||
CLIPTextModel,
|
||||
CLIPTokenizer,
|
||||
T5EncoderModel,
|
||||
)
|
||||
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -22,11 +14,7 @@ from diffusers import (
|
||||
)
|
||||
from diffusers.utils.torch_utils import randn_tensor
|
||||
|
||||
from ...testing_utils import (
|
||||
enable_full_determinism,
|
||||
floats_tensor,
|
||||
torch_device,
|
||||
)
|
||||
from ...testing_utils import enable_full_determinism, floats_tensor, torch_device
|
||||
from ..test_pipelines_common import PipelineTesterMixin
|
||||
|
||||
|
||||
@@ -85,7 +73,8 @@ class FluxControlNetInpaintPipelineTests(unittest.TestCase, PipelineTesterMixin)
|
||||
text_encoder = CLIPTextModel(clip_text_encoder_config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_2 = T5EncoderModel(config)
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
@@ -18,7 +18,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, BertModel, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, BertModel, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -96,7 +96,10 @@ class HunyuanDiTControlNetPipelineFastTests(unittest.TestCase, PipelineTesterMix
|
||||
scheduler = DDPMScheduler()
|
||||
text_encoder = BertModel.from_pretrained("hf-internal-testing/tiny-random-BertModel")
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertModel")
|
||||
text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
torch.manual_seed(0)
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_2 = T5EncoderModel(config)
|
||||
tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
|
||||
@@ -17,7 +17,14 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
CLIPTextConfig,
|
||||
CLIPTextModelWithProjection,
|
||||
CLIPTokenizer,
|
||||
T5EncoderModel,
|
||||
)
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -28,10 +35,7 @@ from diffusers import (
|
||||
from diffusers.models import SD3ControlNetModel
|
||||
from diffusers.utils.torch_utils import randn_tensor
|
||||
|
||||
from ...testing_utils import (
|
||||
enable_full_determinism,
|
||||
torch_device,
|
||||
)
|
||||
from ...testing_utils import enable_full_determinism, torch_device
|
||||
from ..test_pipelines_common import PipelineTesterMixin
|
||||
|
||||
|
||||
@@ -103,7 +107,8 @@ class StableDiffusion3ControlInpaintNetPipelineFastTests(unittest.TestCase, Pipe
|
||||
text_encoder_2 = CLIPTextModelWithProjection(clip_text_encoder_config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_3 = T5EncoderModel(config)
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
|
||||
@@ -18,7 +18,14 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
CLIPTextConfig,
|
||||
CLIPTextModelWithProjection,
|
||||
CLIPTokenizer,
|
||||
T5EncoderModel,
|
||||
)
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -117,7 +124,8 @@ class StableDiffusion3ControlNetPipelineFastTests(unittest.TestCase, PipelineTes
|
||||
text_encoder_2 = CLIPTextModelWithProjection(clip_text_encoder_config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_3 = T5EncoderModel(config)
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
|
||||
@@ -20,7 +20,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKLCosmos, CosmosTextToWorldPipeline, CosmosTransformer3DModel, EDMEulerScheduler
|
||||
|
||||
@@ -107,7 +107,8 @@ class CosmosTextToWorldPipelineFastTests(PipelineTesterMixin, unittest.TestCase)
|
||||
rho=7.0,
|
||||
final_sigmas_type="sigma_min",
|
||||
)
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
@@ -232,6 +233,9 @@ class CosmosTextToWorldPipelineFastTests(PipelineTesterMixin, unittest.TestCase)
|
||||
return
|
||||
|
||||
components = self.get_dummy_components()
|
||||
for key in components:
|
||||
if "text_encoder" in key and hasattr(components[key], "eval"):
|
||||
components[key].eval()
|
||||
pipe = self.pipeline_class(**components)
|
||||
for component in pipe.components.values():
|
||||
if hasattr(component, "set_default_attn_processor"):
|
||||
|
||||
@@ -20,7 +20,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKLWan,
|
||||
@@ -95,7 +95,8 @@ class Cosmos2TextToImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase
|
||||
|
||||
torch.manual_seed(0)
|
||||
scheduler = FlowMatchEulerDiscreteScheduler(use_karras_sigmas=True)
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
|
||||
@@ -21,7 +21,7 @@ import unittest
|
||||
import numpy as np
|
||||
import PIL.Image
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKLWan,
|
||||
@@ -96,7 +96,8 @@ class Cosmos2VideoToWorldPipelineFastTests(PipelineTesterMixin, unittest.TestCas
|
||||
|
||||
torch.manual_seed(0)
|
||||
scheduler = FlowMatchEulerDiscreteScheduler(use_karras_sigmas=True)
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
|
||||
@@ -21,7 +21,7 @@ import unittest
|
||||
import numpy as np
|
||||
import PIL.Image
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKLCosmos, CosmosTransformer3DModel, CosmosVideoToWorldPipeline, EDMEulerScheduler
|
||||
|
||||
@@ -108,7 +108,8 @@ class CosmosVideoToWorldPipelineFastTests(PipelineTesterMixin, unittest.TestCase
|
||||
rho=7.0,
|
||||
final_sigmas_type="sigma_min",
|
||||
)
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
@@ -245,6 +246,9 @@ class CosmosVideoToWorldPipelineFastTests(PipelineTesterMixin, unittest.TestCase
|
||||
return
|
||||
|
||||
components = self.get_dummy_components()
|
||||
for key in components:
|
||||
if "text_encoder" in key and hasattr(components[key], "eval"):
|
||||
components[key].eval()
|
||||
pipe = self.pipeline_class(**components)
|
||||
for component in pipe.components.values():
|
||||
if hasattr(component, "set_default_attn_processor"):
|
||||
|
||||
@@ -2,7 +2,7 @@ import tempfile
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import DDPMScheduler, UNet2DConditionModel
|
||||
from diffusers.models.attention_processor import AttnAddedKVProcessor
|
||||
@@ -18,7 +18,8 @@ from ..test_pipelines_common import to_np
|
||||
class IFPipelineTesterMixin:
|
||||
def _get_dummy_components(self):
|
||||
torch.manual_seed(0)
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
@@ -75,7 +76,8 @@ class IFPipelineTesterMixin:
|
||||
|
||||
def _get_superresolution_dummy_components(self):
|
||||
torch.manual_seed(0)
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
@@ -250,6 +252,9 @@ class IFPipelineTesterMixin:
|
||||
# This should be handled in the base test and then this method can be removed.
|
||||
def _test_save_load_local(self):
|
||||
components = self.get_dummy_components()
|
||||
for key in components:
|
||||
if "text_encoder" in key and hasattr(components[key], "eval"):
|
||||
components[key].eval()
|
||||
pipe = self.pipeline_class(**components)
|
||||
pipe.to(torch_device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
@@ -18,9 +18,7 @@ import unittest
|
||||
|
||||
import torch
|
||||
|
||||
from diffusers import (
|
||||
IFPipeline,
|
||||
)
|
||||
from diffusers import IFPipeline
|
||||
from diffusers.models.attention_processor import AttnAddedKVProcessor
|
||||
from diffusers.utils.import_utils import is_xformers_available
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ import unittest
|
||||
import numpy as np
|
||||
import torch
|
||||
from huggingface_hub import hf_hub_download
|
||||
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -93,7 +93,8 @@ class FluxPipelineFastTests(
|
||||
text_encoder = CLIPTextModel(clip_text_encoder_config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_2 = T5EncoderModel(config)
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
@@ -3,7 +3,7 @@ import unittest
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxControlPipeline, FluxTransformer2DModel
|
||||
|
||||
@@ -53,7 +53,8 @@ class FluxControlPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
|
||||
text_encoder = CLIPTextModel(clip_text_encoder_config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_2 = T5EncoderModel(config)
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
@@ -3,7 +3,7 @@ import unittest
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -57,7 +57,8 @@ class FluxControlImg2ImgPipelineFastTests(unittest.TestCase, PipelineTesterMixin
|
||||
text_encoder = CLIPTextModel(clip_text_encoder_config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_2 = T5EncoderModel(config)
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
@@ -3,7 +3,7 @@ import unittest
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -58,7 +58,8 @@ class FluxControlInpaintPipelineFastTests(unittest.TestCase, PipelineTesterMixin
|
||||
text_encoder = CLIPTextModel(clip_text_encoder_config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_2 = T5EncoderModel(config)
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
@@ -3,7 +3,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxFillPipeline, FluxTransformer2DModel
|
||||
|
||||
@@ -58,7 +58,8 @@ class FluxFillPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
|
||||
text_encoder = CLIPTextModel(clip_text_encoder_config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_2 = T5EncoderModel(config)
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
@@ -3,7 +3,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxImg2ImgPipeline, FluxTransformer2DModel
|
||||
|
||||
@@ -55,7 +55,8 @@ class FluxImg2ImgPipelineFastTests(unittest.TestCase, PipelineTesterMixin, FluxI
|
||||
text_encoder = CLIPTextModel(clip_text_encoder_config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_2 = T5EncoderModel(config)
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
@@ -3,7 +3,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxInpaintPipeline, FluxTransformer2DModel
|
||||
|
||||
@@ -55,7 +55,8 @@ class FluxInpaintPipelineFastTests(unittest.TestCase, PipelineTesterMixin, FluxI
|
||||
text_encoder = CLIPTextModel(clip_text_encoder_config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_2 = T5EncoderModel(config)
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
@@ -3,7 +3,7 @@ import unittest
|
||||
import numpy as np
|
||||
import PIL.Image
|
||||
import torch
|
||||
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -79,7 +79,8 @@ class FluxKontextPipelineFastTests(
|
||||
text_encoder = CLIPTextModel(clip_text_encoder_config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_2 = T5EncoderModel(config)
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
@@ -3,7 +3,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -79,7 +79,8 @@ class FluxKontextInpaintPipelineFastTests(
|
||||
text_encoder = CLIPTextModel(clip_text_encoder_config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_2 = T5EncoderModel(config)
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
@@ -16,7 +16,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, GlmImagePipeline, GlmImageTransformer2DModel
|
||||
from diffusers.utils import is_transformers_version
|
||||
@@ -57,7 +57,8 @@ class GlmImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
|
||||
def get_dummy_components(self):
|
||||
torch.manual_seed(0)
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
glm_config = GlmImageConfig(
|
||||
|
||||
@@ -18,6 +18,7 @@ import unittest
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
CLIPTextConfig,
|
||||
CLIPTextModelWithProjection,
|
||||
@@ -94,7 +95,8 @@ class HiDreamImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
text_encoder_2 = CLIPTextModelWithProjection(clip_text_encoder_config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_3 = T5EncoderModel(config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder_4 = LlamaForCausalLM.from_pretrained("hf-internal-testing/tiny-random-LlamaForCausalLM")
|
||||
@@ -149,12 +151,12 @@ class HiDreamImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
self.assertEqual(generated_image.shape, (128, 128, 3))
|
||||
|
||||
# fmt: off
|
||||
expected_slice = np.array([0.4507, 0.5256, 0.4205, 0.5791, 0.4848, 0.4831, 0.4443, 0.5107, 0.6586, 0.3163, 0.7318, 0.5933, 0.6252, 0.5512, 0.5357, 0.5983])
|
||||
expected_slice = np.array([0.4501, 0.5256, 0.4207, 0.5783, 0.4842, 0.4833, 0.4441, 0.5112, 0.6587, 0.3169, 0.7308, 0.5927, 0.6251, 0.5509, 0.5355, 0.5969])
|
||||
# fmt: on
|
||||
|
||||
generated_slice = generated_image.flatten()
|
||||
generated_slice = np.concatenate([generated_slice[:8], generated_slice[-8:]])
|
||||
self.assertTrue(np.allclose(generated_slice, expected_slice, atol=1e-3))
|
||||
self.assertTrue(np.allclose(generated_slice, expected_slice, atol=5e-3))
|
||||
|
||||
def test_inference_batch_single_identical(self):
|
||||
super().test_inference_batch_single_identical(expected_max_diff=3e-4)
|
||||
|
||||
@@ -223,7 +223,7 @@ class HunyuanImagePipelineFastTests(
|
||||
self.assertEqual(generated_image.shape, (3, 16, 16))
|
||||
|
||||
expected_slice_np = np.array(
|
||||
[0.61494756, 0.49616697, 0.60327923, 0.6115793, 0.49047345, 0.56977504, 0.53066164, 0.58880305, 0.5570612]
|
||||
[0.6068114, 0.48716035, 0.5984431, 0.60241306, 0.48849544, 0.5624479, 0.53696984, 0.58964247, 0.54248774]
|
||||
)
|
||||
output_slice = generated_image[0, -3:, -3:].flatten().cpu().numpy()
|
||||
|
||||
|
||||
@@ -233,7 +233,7 @@ class HunyuanVideoImageToVideoPipelineFastTests(
|
||||
self.assertEqual(generated_video.shape, (5, 3, 16, 16))
|
||||
|
||||
# fmt: off
|
||||
expected_slice = torch.tensor([0.444, 0.479, 0.4485, 0.5752, 0.3539, 0.1548, 0.2706, 0.3593, 0.5323, 0.6635, 0.6795, 0.5255, 0.5091, 0.345, 0.4276, 0.4128])
|
||||
expected_slice = torch.tensor([0.4441, 0.4790, 0.4485, 0.5748, 0.3539, 0.1553, 0.2707, 0.3594, 0.5331, 0.6645, 0.6799, 0.5257, 0.5092, 0.3450, 0.4276, 0.4127])
|
||||
# fmt: on
|
||||
|
||||
generated_slice = generated_video.flatten()
|
||||
|
||||
@@ -15,7 +15,14 @@
|
||||
import unittest
|
||||
|
||||
import torch
|
||||
from transformers import ByT5Tokenizer, Qwen2_5_VLTextConfig, Qwen2_5_VLTextModel, Qwen2Tokenizer, T5EncoderModel
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
ByT5Tokenizer,
|
||||
Qwen2_5_VLTextConfig,
|
||||
Qwen2_5_VLTextModel,
|
||||
Qwen2Tokenizer,
|
||||
T5EncoderModel,
|
||||
)
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKLHunyuanVideo15,
|
||||
@@ -114,7 +121,8 @@ class HunyuanVideo15PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
tokenizer = Qwen2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")
|
||||
|
||||
torch.manual_seed(0)
|
||||
text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_2 = T5EncoderModel(config)
|
||||
tokenizer_2 = ByT5Tokenizer()
|
||||
|
||||
guider = ClassifierFreeGuidance(guidance_scale=1.0)
|
||||
|
||||
@@ -19,7 +19,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, BertModel, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, BertModel, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKL, DDPMScheduler, HunyuanDiT2DModel, HunyuanDiTPipeline
|
||||
|
||||
@@ -74,7 +74,9 @@ class HunyuanDiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
scheduler = DDPMScheduler()
|
||||
text_encoder = BertModel.from_pretrained("hf-internal-testing/tiny-random-BertModel")
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertModel")
|
||||
text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
torch.manual_seed(0)
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_2 = T5EncoderModel(config)
|
||||
tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
|
||||
@@ -19,7 +19,7 @@ import unittest
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoPipelineForImage2Image,
|
||||
@@ -108,7 +108,8 @@ class Kandinsky3PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
torch.manual_seed(0)
|
||||
movq = self.dummy_movq
|
||||
torch.manual_seed(0)
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config).eval()
|
||||
|
||||
torch.manual_seed(0)
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
@@ -155,9 +156,9 @@ class Kandinsky3PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
|
||||
assert image.shape == (1, 16, 16, 3)
|
||||
|
||||
expected_slice = np.array([0.3768, 0.4373, 0.4865, 0.4890, 0.4299, 0.5122, 0.4921, 0.4924, 0.5599])
|
||||
expected_slice = np.array([0.3944, 0.3680, 0.4842, 0.5333, 0.4412, 0.4812, 0.5089, 0.5381, 0.5578])
|
||||
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, (
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1, (
|
||||
f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
|
||||
)
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ import unittest
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoPipelineForImage2Image,
|
||||
@@ -119,7 +119,8 @@ class Kandinsky3Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase)
|
||||
torch.manual_seed(0)
|
||||
movq = self.dummy_movq
|
||||
torch.manual_seed(0)
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config).eval()
|
||||
|
||||
torch.manual_seed(0)
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
@@ -155,10 +156,7 @@ class Kandinsky3Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase)
|
||||
return inputs
|
||||
|
||||
def test_dict_tuple_outputs_equivalent(self):
|
||||
expected_slice = None
|
||||
if torch_device == "cpu":
|
||||
expected_slice = np.array([0.5762, 0.6112, 0.4150, 0.6018, 0.6167, 0.4626, 0.5426, 0.5641, 0.6536])
|
||||
super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
|
||||
super().test_dict_tuple_outputs_equivalent()
|
||||
|
||||
def test_kandinsky3_img2img(self):
|
||||
device = "cpu"
|
||||
@@ -177,11 +175,9 @@ class Kandinsky3Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase)
|
||||
|
||||
assert image.shape == (1, 64, 64, 3)
|
||||
|
||||
expected_slice = np.array(
|
||||
[0.576259, 0.6132097, 0.41703486, 0.603196, 0.62062526, 0.4655338, 0.5434324, 0.5660727, 0.65433365]
|
||||
)
|
||||
expected_slice = np.array([0.5725, 0.6248, 0.4355, 0.5732, 0.6105, 0.5267, 0.5470, 0.5512, 0.6618])
|
||||
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, (
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1, (
|
||||
f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
|
||||
)
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -109,7 +109,8 @@ class LattePipelineFastTests(
|
||||
vae = AutoencoderKL()
|
||||
|
||||
scheduler = DDIMScheduler()
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKLLTXVideo, FlowMatchEulerDiscreteScheduler, LTXPipeline, LTXVideoTransformer3DModel
|
||||
|
||||
@@ -88,7 +88,8 @@ class LTXPipelineFastTests(PipelineTesterMixin, FirstBlockCacheTesterMixin, unit
|
||||
|
||||
torch.manual_seed(0)
|
||||
scheduler = FlowMatchEulerDiscreteScheduler()
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
|
||||
@@ -17,7 +17,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKLLTXVideo,
|
||||
@@ -92,7 +92,8 @@ class LTXConditionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
|
||||
torch.manual_seed(0)
|
||||
scheduler = FlowMatchEulerDiscreteScheduler()
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
|
||||
@@ -17,7 +17,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKLLTXVideo,
|
||||
@@ -91,7 +91,8 @@ class LTXImageToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
|
||||
torch.manual_seed(0)
|
||||
scheduler = FlowMatchEulerDiscreteScheduler()
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
|
||||
@@ -18,7 +18,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKLMochi, FlowMatchEulerDiscreteScheduler, MochiPipeline, MochiTransformer3DModel
|
||||
|
||||
@@ -89,7 +89,8 @@ class MochiPipelineFastTests(
|
||||
|
||||
torch.manual_seed(0)
|
||||
scheduler = FlowMatchEulerDiscreteScheduler()
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
@@ -207,6 +208,9 @@ class MochiPipelineFastTests(
|
||||
return
|
||||
|
||||
components = self.get_dummy_components()
|
||||
for key in components:
|
||||
if "text_encoder" in key and hasattr(components[key], "eval"):
|
||||
components[key].eval()
|
||||
pipe = self.pipeline_class(**components)
|
||||
for component in pipe.components.values():
|
||||
if hasattr(component, "set_default_attn_processor"):
|
||||
|
||||
@@ -19,7 +19,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, BertModel, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, BertModel, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -67,7 +67,9 @@ class HunyuanDiTPAGPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
scheduler = DDPMScheduler()
|
||||
text_encoder = BertModel.from_pretrained("hf-internal-testing/tiny-random-BertModel")
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertModel")
|
||||
text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
torch.manual_seed(0)
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_2 = T5EncoderModel(config)
|
||||
tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
|
||||
@@ -19,7 +19,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
import diffusers
|
||||
from diffusers import (
|
||||
@@ -80,7 +80,8 @@ class PixArtSigmaPAGPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
vae = AutoencoderKL()
|
||||
|
||||
scheduler = DDIMScheduler()
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
|
||||
@@ -3,7 +3,14 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
CLIPTextConfig,
|
||||
CLIPTextModelWithProjection,
|
||||
CLIPTokenizer,
|
||||
T5EncoderModel,
|
||||
)
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -73,7 +80,9 @@ class StableDiffusion3PAGPipelineFastTests(unittest.TestCase, PipelineTesterMixi
|
||||
torch.manual_seed(0)
|
||||
text_encoder_2 = CLIPTextModelWithProjection(clip_text_encoder_config)
|
||||
|
||||
text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
torch.manual_seed(0)
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_3 = T5EncoderModel(config)
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
|
||||
@@ -5,7 +5,14 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
CLIPTextConfig,
|
||||
CLIPTextModelWithProjection,
|
||||
CLIPTokenizer,
|
||||
T5EncoderModel,
|
||||
)
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -84,7 +91,9 @@ class StableDiffusion3PAGImg2ImgPipelineFastTests(unittest.TestCase, PipelineTes
|
||||
torch.manual_seed(0)
|
||||
text_encoder_2 = CLIPTextModelWithProjection(clip_text_encoder_config)
|
||||
|
||||
text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
torch.manual_seed(0)
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_3 = T5EncoderModel(config)
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
|
||||
@@ -19,7 +19,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -77,7 +77,10 @@ class PixArtAlphaPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
vae = AutoencoderKL()
|
||||
|
||||
scheduler = DDIMScheduler()
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
torch.manual_seed(0)
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -83,7 +83,10 @@ class PixArtSigmaPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
vae = AutoencoderKL()
|
||||
|
||||
scheduler = DDIMScheduler()
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
torch.manual_seed(0)
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoTokenizer
|
||||
from transformers.models.t5gemma.configuration_t5gemma import T5GemmaConfig, T5GemmaModuleConfig
|
||||
@@ -11,17 +10,11 @@ from diffusers.models import AutoencoderDC, AutoencoderKL
|
||||
from diffusers.models.transformers.transformer_prx import PRXTransformer2DModel
|
||||
from diffusers.pipelines.prx.pipeline_prx import PRXPipeline
|
||||
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
|
||||
from diffusers.utils import is_transformers_version
|
||||
|
||||
from ..pipeline_params import TEXT_TO_IMAGE_PARAMS
|
||||
from ..test_pipelines_common import PipelineTesterMixin
|
||||
|
||||
|
||||
@pytest.mark.xfail(
|
||||
condition=is_transformers_version(">", "4.57.1"),
|
||||
reason="See https://github.com/huggingface/diffusers/pull/12456#issuecomment-3424228544",
|
||||
strict=False,
|
||||
)
|
||||
class PRXPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
pipeline_class = PRXPipeline
|
||||
params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
|
||||
@@ -99,7 +92,7 @@ class PRXPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
}
|
||||
encoder_config = T5GemmaModuleConfig(**encoder_params)
|
||||
text_encoder_config = T5GemmaConfig(encoder=encoder_config, is_encoder_decoder=False, **encoder_params)
|
||||
text_encoder = T5GemmaEncoder(text_encoder_config)
|
||||
text_encoder = T5GemmaEncoder(text_encoder_config.encoder)
|
||||
|
||||
return {
|
||||
"transformer": transformer,
|
||||
@@ -263,3 +256,27 @@ class PRXPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
expected_image = torch.zeros(3, 32, 32)
|
||||
max_diff = np.abs(generated_image - expected_image).max()
|
||||
self.assertLessEqual(max_diff, 1e10)
|
||||
|
||||
@unittest.skip("Custom T5GemmaEncoder not compatible with transformers v5.")
|
||||
def test_save_load_dduf(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("Custom T5GemmaEncoder not compatible with transformers v5.")
|
||||
def test_loading_with_variants(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("Custom T5GemmaEncoder not compatible with transformers v5.")
|
||||
def test_pipeline_with_accelerator_device_map(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("Custom T5GemmaEncoder not compatible with transformers v5.")
|
||||
def test_save_load_local(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("Custom T5GemmaEncoder not compatible with transformers v5.")
|
||||
def test_save_load_optional_components(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("Custom T5GemmaEncoder not compatible with transformers v5.")
|
||||
def test_torch_dtype_dict(self):
|
||||
pass
|
||||
|
||||
@@ -113,7 +113,7 @@ class QwenImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
vision_start_token_id=151652,
|
||||
vision_token_id=151654,
|
||||
)
|
||||
text_encoder = Qwen2_5_VLForConditionalGeneration(config)
|
||||
text_encoder = Qwen2_5_VLForConditionalGeneration(config).eval()
|
||||
tokenizer = Qwen2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")
|
||||
|
||||
components = {
|
||||
@@ -160,12 +160,12 @@ class QwenImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
self.assertEqual(generated_image.shape, (3, 32, 32))
|
||||
|
||||
# fmt: off
|
||||
expected_slice = torch.tensor([0.56331, 0.63677, 0.6015, 0.56369, 0.58166, 0.55277, 0.57176, 0.63261, 0.41466, 0.35561, 0.56229, 0.48334, 0.49714, 0.52622, 0.40872, 0.50208])
|
||||
expected_slice = torch.tensor([0.5633, 0.6368, 0.6015, 0.5637, 0.5817, 0.5528, 0.5718, 0.6326, 0.4147, 0.3556, 0.5623, 0.4833, 0.4971, 0.5262, 0.4087, 0.5021])
|
||||
# fmt: on
|
||||
|
||||
generated_slice = generated_image.flatten()
|
||||
generated_slice = torch.cat([generated_slice[:8], generated_slice[-8:]])
|
||||
self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=1e-3))
|
||||
self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=5e-3))
|
||||
|
||||
def test_inference_batch_single_identical(self):
|
||||
self._test_inference_batch_single_identical(batch_size=3, expected_max_diff=1e-1)
|
||||
|
||||
@@ -211,7 +211,7 @@ class QwenControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
|
||||
generated_slice = generated_image.flatten()
|
||||
generated_slice = torch.cat([generated_slice[:8], generated_slice[-8:]])
|
||||
self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=1e-3))
|
||||
self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=5e-3))
|
||||
|
||||
def test_qwen_controlnet_multicondition(self):
|
||||
device = "cpu"
|
||||
@@ -255,7 +255,7 @@ class QwenControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
|
||||
generated_slice = generated_image.flatten()
|
||||
generated_slice = torch.cat([generated_slice[:8], generated_slice[-8:]])
|
||||
self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=1e-3))
|
||||
self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=5e-3))
|
||||
|
||||
def test_attention_slicing_forward_pass(
|
||||
self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3
|
||||
|
||||
@@ -115,7 +115,7 @@ class QwenImageEditPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
vision_start_token_id=151652,
|
||||
vision_token_id=151654,
|
||||
)
|
||||
text_encoder = Qwen2_5_VLForConditionalGeneration(config)
|
||||
text_encoder = Qwen2_5_VLForConditionalGeneration(config).eval()
|
||||
tokenizer = Qwen2Tokenizer.from_pretrained(tiny_ckpt_id)
|
||||
|
||||
components = {
|
||||
@@ -163,12 +163,12 @@ class QwenImageEditPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
self.assertEqual(generated_image.shape, (3, 32, 32))
|
||||
|
||||
# fmt: off
|
||||
expected_slice = torch.tensor([[0.5637, 0.6341, 0.6001, 0.5620, 0.5794, 0.5498, 0.5757, 0.6389, 0.4174, 0.3597, 0.5649, 0.4894, 0.4969, 0.5255, 0.4083, 0.4986]])
|
||||
expected_slice = torch.tensor([0.5637, 0.6341, 0.6001, 0.5620, 0.5794, 0.5498, 0.5757, 0.6389, 0.4174, 0.3597, 0.5649, 0.4894, 0.4969, 0.5255, 0.4083, 0.4986])
|
||||
# fmt: on
|
||||
|
||||
generated_slice = generated_image.flatten()
|
||||
generated_slice = torch.cat([generated_slice[:8], generated_slice[-8:]])
|
||||
self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=1e-3))
|
||||
self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=5e-3))
|
||||
|
||||
def test_inference_batch_single_identical(self):
|
||||
self._test_inference_batch_single_identical(batch_size=3, expected_max_diff=1e-1)
|
||||
|
||||
@@ -164,7 +164,7 @@ class QwenImageEditPlusPipelineFastTests(PipelineTesterMixin, unittest.TestCase)
|
||||
self.assertEqual(generated_image.shape, (3, 32, 32))
|
||||
|
||||
# fmt: off
|
||||
expected_slice = torch.tensor([[0.5637, 0.6341, 0.6001, 0.5620, 0.5794, 0.5498, 0.5757, 0.6389, 0.4174, 0.3597, 0.5649, 0.4894, 0.4969, 0.5255, 0.4083, 0.4986]])
|
||||
expected_slice = torch.tensor([0.5640, 0.6339, 0.5997, 0.5607, 0.5799, 0.5496, 0.5760, 0.6393, 0.4172, 0.3595, 0.5655, 0.4896, 0.4971, 0.5255, 0.4088, 0.4987])
|
||||
# fmt: on
|
||||
|
||||
generated_slice = generated_image.flatten()
|
||||
|
||||
@@ -18,20 +18,11 @@ import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKLWan,
|
||||
SkyReelsV2Pipeline,
|
||||
SkyReelsV2Transformer3DModel,
|
||||
UniPCMultistepScheduler,
|
||||
)
|
||||
from diffusers import AutoencoderKLWan, SkyReelsV2Pipeline, SkyReelsV2Transformer3DModel, UniPCMultistepScheduler
|
||||
|
||||
from ...testing_utils import (
|
||||
enable_full_determinism,
|
||||
)
|
||||
from ...testing_utils import enable_full_determinism
|
||||
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
|
||||
from ..test_pipelines_common import (
|
||||
PipelineTesterMixin,
|
||||
)
|
||||
from ..test_pipelines_common import PipelineTesterMixin
|
||||
|
||||
|
||||
enable_full_determinism()
|
||||
|
||||
@@ -25,13 +25,9 @@ from diffusers import (
|
||||
UniPCMultistepScheduler,
|
||||
)
|
||||
|
||||
from ...testing_utils import (
|
||||
enable_full_determinism,
|
||||
)
|
||||
from ...testing_utils import enable_full_determinism
|
||||
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
|
||||
from ..test_pipelines_common import (
|
||||
PipelineTesterMixin,
|
||||
)
|
||||
from ..test_pipelines_common import PipelineTesterMixin
|
||||
|
||||
|
||||
enable_full_determinism()
|
||||
|
||||
@@ -17,10 +17,7 @@ import unittest
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
T5EncoderModel,
|
||||
)
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKLWan,
|
||||
|
||||
@@ -27,14 +27,9 @@ from diffusers import (
|
||||
UniPCMultistepScheduler,
|
||||
)
|
||||
|
||||
from ...testing_utils import (
|
||||
enable_full_determinism,
|
||||
torch_device,
|
||||
)
|
||||
from ...testing_utils import enable_full_determinism, torch_device
|
||||
from ..pipeline_params import TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
|
||||
from ..test_pipelines_common import (
|
||||
PipelineTesterMixin,
|
||||
)
|
||||
from ..test_pipelines_common import PipelineTesterMixin
|
||||
|
||||
|
||||
enable_full_determinism()
|
||||
|
||||
@@ -19,10 +19,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import (
|
||||
T5EncoderModel,
|
||||
T5Tokenizer,
|
||||
)
|
||||
from transformers import AutoConfig, T5EncoderModel, T5Tokenizer
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderOobleck,
|
||||
@@ -111,7 +108,8 @@ class StableAudioPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
)
|
||||
torch.manual_seed(0)
|
||||
t5_repo_id = "hf-internal-testing/tiny-random-T5ForConditionalGeneration"
|
||||
text_encoder = T5EncoderModel.from_pretrained(t5_repo_id)
|
||||
config = AutoConfig.from_pretrained(t5_repo_id)
|
||||
text_encoder = T5EncoderModel(config)
|
||||
tokenizer = T5Tokenizer.from_pretrained(t5_repo_id, truncation=True, model_max_length=25)
|
||||
|
||||
torch.manual_seed(0)
|
||||
|
||||
@@ -3,7 +3,14 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
CLIPTextConfig,
|
||||
CLIPTextModelWithProjection,
|
||||
CLIPTokenizer,
|
||||
T5EncoderModel,
|
||||
)
|
||||
|
||||
from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, SD3Transformer2DModel, StableDiffusion3Pipeline
|
||||
|
||||
@@ -72,7 +79,9 @@ class StableDiffusion3PipelineFastTests(unittest.TestCase, PipelineTesterMixin):
|
||||
torch.manual_seed(0)
|
||||
text_encoder_2 = CLIPTextModelWithProjection(clip_text_encoder_config)
|
||||
|
||||
text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
torch.manual_seed(0)
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_3 = T5EncoderModel(config)
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
|
||||
@@ -4,7 +4,14 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
CLIPTextConfig,
|
||||
CLIPTextModelWithProjection,
|
||||
CLIPTokenizer,
|
||||
T5EncoderModel,
|
||||
)
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -73,7 +80,9 @@ class StableDiffusion3Img2ImgPipelineFastTests(PipelineLatentTesterMixin, unitte
|
||||
torch.manual_seed(0)
|
||||
text_encoder_2 = CLIPTextModelWithProjection(clip_text_encoder_config)
|
||||
|
||||
text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
torch.manual_seed(0)
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_3 = T5EncoderModel(config)
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
|
||||
@@ -3,7 +3,14 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
CLIPTextConfig,
|
||||
CLIPTextModelWithProjection,
|
||||
CLIPTokenizer,
|
||||
T5EncoderModel,
|
||||
)
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -73,7 +80,9 @@ class StableDiffusion3InpaintPipelineFastTests(PipelineLatentTesterMixin, unitte
|
||||
torch.manual_seed(0)
|
||||
text_encoder_2 = CLIPTextModelWithProjection(clip_text_encoder_config)
|
||||
|
||||
text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
torch.manual_seed(0)
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_3 = T5EncoderModel(config)
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user