Mirror of https://github.com/huggingface/diffusers.git (synced 2026-03-05 16:20:39 +08:00)

Compare commits: modular-no... → pin-transf... (2 commits: 4efd3de674, 685ee01154)
14 .github/workflows/benchmark.yml (vendored)
@@ -62,6 +62,20 @@ jobs:
        with:
          name: benchmark_test_reports
          path: benchmarks/${{ env.BASE_PATH }}

      # TODO: enable this once the connection problem has been resolved.
      - name: Update benchmarking results to DB
        env:
          PGDATABASE: metrics
          PGHOST: ${{ secrets.DIFFUSERS_BENCHMARKS_PGHOST }}
          PGUSER: transformers_benchmarks
          PGPASSWORD: ${{ secrets.DIFFUSERS_BENCHMARKS_PGPASSWORD }}
          BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
        run: |
          git config --global --add safe.directory /__w/diffusers/diffusers
          commit_id=$GITHUB_SHA
          commit_msg=$(git show -s --format=%s "$commit_id" | cut -c1-70)
          cd benchmarks && python populate_into_db.py "$BRANCH_NAME" "$commit_id" "$commit_msg"

      - name: Report success status
        if: ${{ success() }}
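For context, the DB step above hands `populate_into_db.py` three positional arguments: the branch name, the full commit SHA, and the commit subject truncated to 70 characters. A minimal Python sketch of the same argument construction (illustrative only; the workflow does this in bash):

```py
import subprocess

# Reproduce the workflow's argument construction (illustrative sketch).
commit_id = subprocess.check_output(["git", "rev-parse", "HEAD"], text=True).strip()
commit_msg = subprocess.check_output(
    ["git", "show", "-s", "--format=%s", commit_id], text=True
).strip()[:70]  # mirrors `cut -c1-70`
print(["populate_into_db.py", "<branch>", commit_id, commit_msg])
```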
13 .github/workflows/pr_tests.yml (vendored)
@@ -92,6 +92,7 @@ jobs:
          runner: aws-general-8-plus
          image: diffusers/diffusers-pytorch-cpu
          report: torch_example_cpu

    name: ${{ matrix.config.name }}

    runs-on:
@@ -114,7 +115,8 @@ jobs:
      - name: Install dependencies
        run: |
          uv pip install -e ".[quality]"
          uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
          #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
          uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
          uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps

      - name: Environment
@@ -216,6 +218,8 @@ jobs:

  run_lora_tests:
    needs: [check_code_quality, check_repository_consistency]
    strategy:
      fail-fast: false

    name: LoRA tests with PEFT main

@@ -243,8 +247,9 @@ jobs:
          uv pip install -U peft@git+https://github.com/huggingface/peft.git --no-deps
          uv pip install -U tokenizers
          uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
          uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
          #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
          uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1

      - name: Environment
        run: |
          python utils/print_env.py
@@ -270,6 +275,6 @@ jobs:
        if: ${{ always() }}
        uses: actions/upload-artifact@v6
        with:
          name: pr_lora_test_reports
          name: pr_main_test_reports
          path: reports

14 .github/workflows/pr_tests_gpu.yml (vendored)
@@ -131,7 +131,8 @@ jobs:
        run: |
          uv pip install -e ".[quality]"
          uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
          uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
          #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
          uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1

      - name: Environment
        run: |
@@ -198,10 +199,16 @@ jobs:

      - name: Install dependencies
        run: |
          # Install pkgs which depend on setuptools<81 for pkg_resources first with no build isolation
          uv pip install pip==25.2 setuptools==80.10.2
          uv pip install --no-build-isolation k-diffusion==0.0.12
          uv pip install --upgrade pip setuptools
          # Install the rest as normal
          uv pip install -e ".[quality]"
          uv pip install peft@git+https://github.com/huggingface/peft.git
          uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
          uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
          #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
          uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1

      - name: Environment
        run: |
@@ -262,7 +269,8 @@ jobs:
          nvidia-smi
      - name: Install dependencies
        run: |
          uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
          #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
          uv pip uninstall transformers huggingface_hub && uv pip install transformers==4.57.1
          uv pip install -e ".[quality,training]"

      - name: Environment

15 .github/workflows/push_tests.yml (vendored)
@@ -76,7 +76,9 @@ jobs:
        run: |
          uv pip install -e ".[quality]"
          uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
          uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
          #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
          uv pip uninstall transformers huggingface_hub && uv pip install transformers

      - name: Environment
        run: |
          python utils/print_env.py
@@ -125,10 +127,16 @@ jobs:

      - name: Install dependencies
        run: |
          # Install pkgs which depend on setuptools<81 for pkg_resources first with no build isolation
          uv pip install pip==25.2 setuptools==80.10.2
          uv pip install --no-build-isolation k-diffusion==0.0.12
          uv pip install --upgrade pip setuptools
          # Install the rest as normal
          uv pip install -e ".[quality]"
          uv pip install peft@git+https://github.com/huggingface/peft.git
          uv pip uninstall accelerate && uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
          uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
          #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
          uv pip uninstall transformers huggingface_hub && uv pip install transformers

      - name: Environment
        run: |
@@ -180,7 +188,8 @@ jobs:
      - name: Install dependencies
        run: |
          uv pip install -e ".[quality,training]"
          uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
          #uv pip uninstall transformers huggingface_hub && uv pip install --prerelease allow -U transformers@git+https://github.com/huggingface/transformers.git
          uv pip uninstall transformers huggingface_hub && uv pip install transformers
      - name: Environment
        run: |
          python utils/print_env.py

2 .github/workflows/push_tests_mps.yml (vendored)
@@ -41,7 +41,7 @@ jobs:
        shell: arch -arch arm64 bash {0}
        run: |
          ${CONDA_RUN} python -m pip install --upgrade pip uv
          ${CONDA_RUN} python -m uv pip install -e ".[quality]"
          ${CONDA_RUN} python -m uv pip install -e ".[quality,test]"
          ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio
          ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
          ${CONDA_RUN} python -m uv pip install transformers --upgrade

3 .github/workflows/pypi_publish.yaml (vendored)
@@ -54,6 +54,7 @@ jobs:
          python -m pip install --upgrade pip
          pip install -U setuptools wheel twine
          pip install -U torch --index-url https://download.pytorch.org/whl/cpu
          pip install -U transformers

      - name: Build the dist files
        run: python setup.py bdist_wheel && python setup.py sdist
@@ -68,8 +69,6 @@ jobs:
        run: |
          pip install diffusers && pip uninstall diffusers -y
          pip install -i https://test.pypi.org/simple/ diffusers
          pip install -U transformers
          python utils/print_env.py
          python -c "from diffusers import __version__; print(__version__)"
          python -c "from diffusers import DiffusionPipeline; pipe = DiffusionPipeline.from_pretrained('fusing/unet-ldm-dummy-update'); pipe()"
          python -c "from diffusers import DiffusionPipeline; pipe = DiffusionPipeline.from_pretrained('hf-internal-testing/tiny-stable-diffusion-pipe', safety_checker=None); pipe('ah suh du')"

166 benchmarks/populate_into_db.py (new file)
@@ -0,0 +1,166 @@
import argparse
import os
import sys

import gpustat
import pandas as pd
import psycopg2
import psycopg2.extras
from psycopg2.extensions import register_adapter
from psycopg2.extras import Json


register_adapter(dict, Json)

FINAL_CSV_FILENAME = "collated_results.csv"
# https://github.com/huggingface/transformers/blob/593e29c5e2a9b17baec010e8dc7c1431fed6e841/benchmark/init_db.sql#L27
BENCHMARKS_TABLE_NAME = "benchmarks"
MEASUREMENTS_TABLE_NAME = "model_measurements"


def _init_benchmark(conn, branch, commit_id, commit_msg):
    gpu_stats = gpustat.GPUStatCollection.new_query()
    metadata = {"gpu_name": gpu_stats[0]["name"]}
    repository = "huggingface/diffusers"
    with conn.cursor() as cur:
        cur.execute(
            f"INSERT INTO {BENCHMARKS_TABLE_NAME} (repository, branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s, %s) RETURNING benchmark_id",
            (repository, branch, commit_id, commit_msg, metadata),
        )
        benchmark_id = cur.fetchone()[0]
        print(f"Initialised benchmark #{benchmark_id}")
    return benchmark_id


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "branch",
        type=str,
        help="The branch name on which the benchmarking is performed.",
    )

    parser.add_argument(
        "commit_id",
        type=str,
        help="The commit hash on which the benchmarking is performed.",
    )

    parser.add_argument(
        "commit_msg",
        type=str,
        help="The commit message associated with the commit, truncated to 70 characters.",
    )
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = parse_args()
    try:
        conn = psycopg2.connect(
            host=os.getenv("PGHOST"),
            database=os.getenv("PGDATABASE"),
            user=os.getenv("PGUSER"),
            password=os.getenv("PGPASSWORD"),
        )
        print("DB connection established successfully.")
    except Exception as e:
        print(f"Problem during DB init: {e}")
        sys.exit(1)

    try:
        benchmark_id = _init_benchmark(
            conn=conn,
            branch=args.branch,
            commit_id=args.commit_id,
            commit_msg=args.commit_msg,
        )
    except Exception as e:
        print(f"Problem during initializing benchmark: {e}")
        sys.exit(1)

    cur = conn.cursor()

    df = pd.read_csv(FINAL_CSV_FILENAME)

    # Helper to cast values (or None) given a dtype
    def _cast_value(val, dtype: str):
        if pd.isna(val):
            return None

        if dtype == "text":
            return str(val).strip()

        if dtype == "float":
            try:
                return float(val)
            except ValueError:
                return None

        if dtype == "bool":
            s = str(val).strip().lower()
            if s in ("true", "t", "yes", "1"):
                return True
            if s in ("false", "f", "no", "0"):
                return False
            if val in (1, 1.0):
                return True
            if val in (0, 0.0):
                return False
            return None

        return val

    try:
        rows_to_insert = []
        for _, row in df.iterrows():
            scenario = _cast_value(row.get("scenario"), "text")
            model_cls = _cast_value(row.get("model_cls"), "text")
            num_params_B = _cast_value(row.get("num_params_B"), "float")
            flops_G = _cast_value(row.get("flops_G"), "float")
            time_plain_s = _cast_value(row.get("time_plain_s"), "float")
            mem_plain_GB = _cast_value(row.get("mem_plain_GB"), "float")
            time_compile_s = _cast_value(row.get("time_compile_s"), "float")
            mem_compile_GB = _cast_value(row.get("mem_compile_GB"), "float")
            fullgraph = _cast_value(row.get("fullgraph"), "bool")
            mode = _cast_value(row.get("mode"), "text")

            # If "github_sha" column exists in the CSV, cast it; else default to None
            if "github_sha" in df.columns:
                github_sha = _cast_value(row.get("github_sha"), "text")
            else:
                github_sha = None

            measurements = {
                "scenario": scenario,
                "model_cls": model_cls,
                "num_params_B": num_params_B,
                "flops_G": flops_G,
                "time_plain_s": time_plain_s,
                "mem_plain_GB": mem_plain_GB,
                "time_compile_s": time_compile_s,
                "mem_compile_GB": mem_compile_GB,
                "fullgraph": fullgraph,
                "mode": mode,
                "github_sha": github_sha,
            }
            rows_to_insert.append((benchmark_id, measurements))

        # Batch-insert all rows
        insert_sql = f"""
            INSERT INTO {MEASUREMENTS_TABLE_NAME} (
                benchmark_id,
                measurements
            )
            VALUES (%s, %s);
        """

        psycopg2.extras.execute_batch(cur, insert_sql, rows_to_insert)
        conn.commit()

        cur.close()
        conn.close()
    except Exception as e:
        print(f"Exception: {e}")
        sys.exit(1)
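A hypothetical local invocation of the script above; the credentials and commit values are placeholders, and the script expects a `collated_results.csv` in the working directory:

```py
# Hypothetical local run of populate_into_db.py; values are placeholders.
import os
import subprocess

os.environ.update(
    PGDATABASE="metrics",
    PGHOST="<host>",
    PGUSER="transformers_benchmarks",
    PGPASSWORD="<password>",
)
subprocess.run(
    ["python", "populate_into_db.py", "main", "<commit_sha>", "<commit subject>"],
    check=True,
)
```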
@@ -14,8 +14,4 @@

## AutoPipelineBlocks

[[autodoc]] diffusers.modular_pipelines.modular_pipeline.AutoPipelineBlocks

## ConditionalPipelineBlocks

[[autodoc]] diffusers.modular_pipelines.modular_pipeline.ConditionalPipelineBlocks
[[autodoc]] diffusers.modular_pipelines.modular_pipeline.AutoPipelineBlocks
@@ -46,20 +46,6 @@ output = pipe(
output.save("output.png")
```

## Cosmos2_5_TransferPipeline

[[autodoc]] Cosmos2_5_TransferPipeline
- all
- __call__


## Cosmos2_5_PredictBasePipeline

[[autodoc]] Cosmos2_5_PredictBasePipeline
- all
- __call__


## CosmosTextToWorldPipeline

[[autodoc]] CosmosTextToWorldPipeline
@@ -84,6 +70,12 @@ output.save("output.png")
- all
- __call__

## Cosmos2_5_PredictBasePipeline

[[autodoc]] Cosmos2_5_PredictBasePipeline
- all
- __call__

## CosmosPipelineOutput

[[autodoc]] pipelines.cosmos.pipeline_output.CosmosPipelineOutput
@@ -29,7 +29,7 @@ Qwen-Image comes in the following variants:
| Qwen-Image-Edit Plus | [Qwen/Qwen-Image-Edit-2509](https://huggingface.co/Qwen/Qwen-Image-Edit-2509) |

> [!TIP]
> See the [Caching](../../optimization/cache) guide to speed up inference by storing and reusing intermediate outputs.
> [Caching](../../optimization/cache) may also speed up inference by storing and reusing intermediate outputs.

## LoRA for faster inference

@@ -190,12 +190,6 @@ For detailed benchmark scripts and results, see [this gist](https://gist.github.
- all
- __call__

## QwenImageLayeredPipeline

[[autodoc]] QwenImageLayeredPipeline
- all
- __call__

## QwenImagePipelineOutput

[[autodoc]] pipelines.qwenimage.pipeline_output.QwenImagePipelineOutput
@@ -121,7 +121,7 @@ from diffusers.modular_pipelines import AutoPipelineBlocks

class AutoImageBlocks(AutoPipelineBlocks):
    # List of sub-block classes to choose from
    block_classes = [InpaintBlock, ImageToImageBlock, TextToImageBlock]
    block_classes = [block_inpaint_cls, block_i2i_cls, block_t2i_cls]
    # Names for each block in the same order
    block_names = ["inpaint", "img2img", "text2img"]
    # Trigger inputs that determine which block to run
@@ -129,8 +129,8 @@ class AutoImageBlocks(AutoPipelineBlocks):
    # - "image" triggers img2img workflow (but only if mask is not provided)
    # - if none of above, runs the text2img workflow (default)
    block_trigger_inputs = ["mask", "image", None]
    # Description is extremely important for AutoPipelineBlocks

    @property
    def description(self):
        return (
            "Pipeline generates images given different types of conditions!\n"
@@ -141,7 +141,7 @@ class AutoImageBlocks(AutoPipelineBlocks):
        )
```

It is **very** important to include a `description` to avoid any confusion over how to run a block and what inputs are required. While [`~modular_pipelines.AutoPipelineBlocks`] are convenient, its conditional logic may be difficult to figure out if it isn't properly explained.
It is **very** important to include a `description` to avoid any confusion over how to run a block and what inputs are required. While [`~modular_pipelines.AutoPipelineBlocks`] are convenient, it's conditional logic may be difficult to figure out if it isn't properly explained.

Create an instance of `AutoImageBlocks`.

@@ -152,74 +152,5 @@ auto_blocks = AutoImageBlocks()
For more complex compositions, such as nested [`~modular_pipelines.AutoPipelineBlocks`] blocks when they're used as sub-blocks in larger pipelines, use the [`~modular_pipelines.SequentialPipelineBlocks.get_execution_blocks`] method to extract the block that is actually run based on your input.

```py
auto_blocks.get_execution_blocks(mask=True)
```

## ConditionalPipelineBlocks

[`~modular_pipelines.AutoPipelineBlocks`] is a special case of [`~modular_pipelines.ConditionalPipelineBlocks`]. While [`~modular_pipelines.AutoPipelineBlocks`] selects blocks based on whether a trigger input is provided or not, [`~modular_pipelines.ConditionalPipelineBlocks`] is able to select a block based on custom selection logic provided in the `select_block` method.

Here is the same example written using [`~modular_pipelines.ConditionalPipelineBlocks`] directly:

```py
from diffusers.modular_pipelines import ConditionalPipelineBlocks

class AutoImageBlocks(ConditionalPipelineBlocks):
    block_classes = [InpaintBlock, ImageToImageBlock, TextToImageBlock]
    block_names = ["inpaint", "img2img", "text2img"]
    block_trigger_inputs = ["mask", "image"]
    default_block_name = "text2img"

    @property
    def description(self):
        return (
            "Pipeline generates images given different types of conditions!\n"
            + "This is an auto pipeline block that works for text2img, img2img and inpainting tasks.\n"
            + " - inpaint workflow is run when `mask` is provided.\n"
            + " - img2img workflow is run when `image` is provided (but only when `mask` is not provided).\n"
            + " - text2img workflow is run when neither `image` nor `mask` is provided.\n"
        )

    def select_block(self, mask=None, image=None) -> str | None:
        if mask is not None:
            return "inpaint"
        if image is not None:
            return "img2img"
        return None  # falls back to default_block_name ("text2img")
```

The inputs listed in `block_trigger_inputs` are passed as keyword arguments to `select_block()`. When `select_block` returns `None`, it falls back to `default_block_name`. If `default_block_name` is also `None`, the entire conditional block is skipped — this is useful for optional processing steps that should only run when specific inputs are provided.

## Workflows

Pipelines that contain conditional blocks ([`~modular_pipelines.AutoPipelineBlocks`] or [`~modular_pipelines.ConditionalPipelineBlocks`]) can support multiple workflows — for example, our SDXL modular pipeline supports a dozen workflows all in one pipeline. But this also means it can be confusing for users to know which workflows are supported and how to run them. For pipeline builders, it's useful to be able to extract only the blocks relevant to a specific workflow.

We recommend defining a `_workflow_map` to give each workflow a name and explicitly list the inputs it requires.

```py
from diffusers.modular_pipelines import SequentialPipelineBlocks

class MyPipelineBlocks(SequentialPipelineBlocks):
    block_classes = [TextEncoderBlock, AutoImageBlocks, DecodeBlock]
    block_names = ["text_encoder", "auto_image", "decode"]

    _workflow_map = {
        "text2image": {"prompt": True},
        "image2image": {"image": True, "prompt": True},
        "inpaint": {"mask": True, "image": True, "prompt": True},
    }
```

All of our built-in modular pipelines come with pre-defined workflows. The `available_workflows` property lists all supported workflows:

```py
pipeline_blocks = MyPipelineBlocks()
pipeline_blocks.available_workflows
# ['text2image', 'image2image', 'inpaint']
```

Retrieve a specific workflow with `get_workflow` to inspect and debug the specific block that executes the workflow.

```py
pipeline_blocks.get_workflow("inpaint")
auto_blocks.get_execution_blocks("mask")
```
@@ -111,7 +111,7 @@ if __name__ == "__main__":
Call `torchrun` to run the inference script and use the `--nproc_per_node` argument to set the number of GPUs to use.

```bash
torchrun --nproc_per_node=2 run_distributed.py
torchrun run_distributed.py --nproc_per_node=2
```

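The contents of `run_distributed.py` are not shown in this hunk; as a hedged sketch, such scripts typically pin one GPU per process using the `LOCAL_RANK` environment variable that `torchrun` sets:

```py
# Illustrative only; the actual run_distributed.py is not part of this diff.
import os

import torch

# torchrun sets LOCAL_RANK for each spawned process; pin one GPU per process.
local_rank = int(os.environ.get("LOCAL_RANK", 0))
device = torch.device(f"cuda:{local_rank}")
print(f"process {local_rank} -> {device}")
```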
## device_map

@@ -97,32 +97,5 @@ If the custom model inherits from the [`ModelMixin`] class, it gets access to th
> )
> ```

### Saving custom models

Use [`~ConfigMixin.register_for_auto_class`] to add the `auto_map` entry to `config.json` automatically when saving. This avoids having to manually edit the config file.

```py
# my_model.py
from diffusers import ModelMixin, ConfigMixin

class MyCustomModel(ModelMixin, ConfigMixin):
    ...

MyCustomModel.register_for_auto_class("AutoModel")

model = MyCustomModel(...)
model.save_pretrained("./my_model")
```

The saved `config.json` will include the `auto_map` field.

```json
{
  "auto_map": {
    "AutoModel": "my_model.MyCustomModel"
  }
}
```

> [!NOTE]
> Learn more about implementing custom models in the [Community components](../using-diffusers/custom_pipeline_overview#community-components) guide.
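The loading side of this flow, sketched from the `register_for_auto_class` docstring later in this diff (which states the class becomes loadable via `AutoModel.from_pretrained(..., trust_remote_code=True)`); the path is the one saved above:

```py
# Hedged sketch of loading a custom model registered via auto_map.
from diffusers import AutoModel

model = AutoModel.from_pretrained("./my_model", trust_remote_code=True)
```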
@@ -17,9 +17,6 @@ import logging
import os
import sys
import tempfile
import unittest

from diffusers.utils import is_transformers_version


sys.path.append("..")
@@ -33,7 +30,6 @@ stream_handler = logging.StreamHandler(sys.stdout)
logger.addHandler(stream_handler)


@unittest.skipIf(is_transformers_version(">=", "4.57.5"), "Size mismatch")
class CustomDiffusion(ExamplesTestsAccelerate):
    def test_custom_diffusion(self):
        with tempfile.TemporaryDirectory() as tmpdir:

@@ -94,15 +94,9 @@ python scripts/convert_cosmos_to_diffusers.py \
    --transformer_type Cosmos-2.5-Transfer-General-2B \
    --transformer_ckpt_path $transformer_ckpt_path \
    --vae_type wan2.1 \
    --output_path converted/transfer/2b/general/depth/pipeline \
    --output_path converted/transfer/2b/general/depth \
    --save_pipeline

python scripts/convert_cosmos_to_diffusers.py \
    --transformer_type Cosmos-2.5-Transfer-General-2B \
    --transformer_ckpt_path $transformer_ckpt_path \
    --vae_type wan2.1 \
    --output_path converted/transfer/2b/general/depth/models

# edge
transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Transfer2.5-2B/snapshots/eb5325b77d358944da58a690157dd2b8071bbf85/general/edge/61f5694b-0ad5-4ecd-8ad7-c8545627d125_ema_bf16.pt

@@ -126,15 +120,9 @@ python scripts/convert_cosmos_to_diffusers.py \
    --transformer_type Cosmos-2.5-Transfer-General-2B \
    --transformer_ckpt_path $transformer_ckpt_path \
    --vae_type wan2.1 \
    --output_path converted/transfer/2b/general/blur/pipeline \
    --output_path converted/transfer/2b/general/blur \
    --save_pipeline

python scripts/convert_cosmos_to_diffusers.py \
    --transformer_type Cosmos-2.5-Transfer-General-2B \
    --transformer_ckpt_path $transformer_ckpt_path \
    --vae_type wan2.1 \
    --output_path converted/transfer/2b/general/blur/models

# seg
transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Transfer2.5-2B/snapshots/eb5325b77d358944da58a690157dd2b8071bbf85/general/seg/5136ef49-6d8d-42e8-8abf-7dac722a304a_ema_bf16.pt

@@ -142,14 +130,8 @@ python scripts/convert_cosmos_to_diffusers.py \
    --transformer_type Cosmos-2.5-Transfer-General-2B \
    --transformer_ckpt_path $transformer_ckpt_path \
    --vae_type wan2.1 \
    --output_path converted/transfer/2b/general/seg/pipeline \
    --output_path converted/transfer/2b/general/seg \
    --save_pipeline

python scripts/convert_cosmos_to_diffusers.py \
    --transformer_type Cosmos-2.5-Transfer-General-2B \
    --transformer_ckpt_path $transformer_ckpt_path \
    --vae_type wan2.1 \
    --output_path converted/transfer/2b/general/seg/models
```
"""

4 setup.py
@@ -101,7 +101,6 @@ _deps = [
    "datasets",
    "filelock",
    "flax>=0.4.1",
    "ftfy",
    "hf-doc-builder>=0.3.0",
    "httpx<1.0.0",
    "huggingface-hub>=0.34.0,<2.0",
@@ -222,14 +221,12 @@ extras["docs"] = deps_list("hf-doc-builder")
extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2", "peft", "timm")
extras["test"] = deps_list(
    "compel",
    "ftfy",
    "GitPython",
    "datasets",
    "Jinja2",
    "invisible-watermark",
    "librosa",
    "parameterized",
    "protobuf",
    "pytest",
    "pytest-timeout",
    "pytest-xdist",
@@ -238,7 +235,6 @@ extras["test"] = deps_list(
    "sentencepiece",
    "scipy",
    "tiktoken",
    "torchsde",
    "torchvision",
    "transformers",
    "phonemizer",

@@ -107,38 +107,6 @@ class ConfigMixin:
    has_compatibles = False

    _deprecated_kwargs = []
    _auto_class = None

    @classmethod
    def register_for_auto_class(cls, auto_class="AutoModel"):
        """
        Register this class with the given auto class so that it can be loaded with `AutoModel.from_pretrained(...,
        trust_remote_code=True)`.

        When the config is saved, the resulting `config.json` will include an `auto_map` entry mapping the auto class
        to this class's module and class name.

        Args:
            auto_class (`str` or type, *optional*, defaults to `"AutoModel"`):
                The auto class to register this class with. Can be a string (e.g. `"AutoModel"`) or the class itself.
                Currently only `"AutoModel"` is supported.

        Example:

        ```python
        from diffusers import ModelMixin, ConfigMixin


        class MyCustomModel(ModelMixin, ConfigMixin): ...


        MyCustomModel.register_for_auto_class("AutoModel")
        ```
        """
        if auto_class != "AutoModel":
            raise ValueError(f"Only 'AutoModel' is supported, got '{auto_class}'.")

        cls._auto_class = auto_class

    def register_to_config(self, **kwargs):
        if self.config_name is None:
@@ -653,12 +621,6 @@ class ConfigMixin:
        # pop the `_pre_quantization_dtype` as torch.dtypes are not serializable.
        _ = config_dict.pop("_pre_quantization_dtype", None)

        if getattr(self, "_auto_class", None) is not None:
            module = self.__class__.__module__.split(".")[-1]
            auto_map = config_dict.get("auto_map", {})
            auto_map[self._auto_class] = f"{module}.{self.__class__.__name__}"
            config_dict["auto_map"] = auto_map

        return json.dumps(config_dict, indent=2, sort_keys=True) + "\n"

    def to_json_file(self, json_file_path: str | os.PathLike):

@@ -8,7 +8,6 @@ deps = {
    "datasets": "datasets",
    "filelock": "filelock",
    "flax": "flax>=0.4.1",
    "ftfy": "ftfy",
    "hf-doc-builder": "hf-doc-builder>=0.3.0",
    "httpx": "httpx<1.0.0",
    "huggingface-hub": "huggingface-hub>=0.34.0,<2.0",

@@ -48,7 +48,6 @@ _GO_LC_SUPPORTED_PYTORCH_LAYERS = (
    torch.nn.ConvTranspose2d,
    torch.nn.ConvTranspose3d,
    torch.nn.Linear,
    torch.nn.Embedding,
    # TODO(aryan): look into torch.nn.LayerNorm, torch.nn.GroupNorm later, seems to be causing some issues with CogVideoX
    # because of double invocation of the same norm layer in CogVideoXLayerNorm
)

@@ -856,7 +856,7 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
    )
    state_dict = {k: v for k, v in state_dict.items() if not k.startswith("text_encoders.t5xxl.transformer.")}

    has_diffb = any("diff_b" in k and k.startswith(("lora_unet_", "lora_te_", "lora_te1_")) for k in state_dict)
    has_diffb = any("diff_b" in k and k.startswith(("lora_unet_", "lora_te_")) for k in state_dict)
    if has_diffb:
        zero_status_diff_b = state_dict_all_zero(state_dict, ".diff_b")
        if zero_status_diff_b:
@@ -895,7 +895,7 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
    state_dict = {
        _custom_replace(k, limit_substrings): v
        for k, v in state_dict.items()
        if k.startswith(("lora_unet_", "lora_te_", "lora_te1_"))
        if k.startswith(("lora_unet_", "lora_te_"))
    }

    if any("text_projection" in k for k in state_dict):

@@ -5472,10 +5472,6 @@ class Flux2LoraLoaderMixin(LoraBaseMixin):
        logger.warning(warn_msg)
        state_dict = {k: v for k, v in state_dict.items() if "dora_scale" not in k}

        is_peft_format = any(k.startswith("base_model.model.") for k in state_dict)
        if is_peft_format:
            state_dict = {k.replace("base_model.model.", "diffusion_model."): v for k, v in state_dict.items()}

        is_ai_toolkit = any(k.startswith("diffusion_model.") for k in state_dict)
        if is_ai_toolkit:
            state_dict = _convert_non_diffusers_flux2_lora_to_diffusers(state_dict)

@@ -22,12 +22,7 @@ from tokenizers import Tokenizer as TokenizerFast
from torch import nn

from ..models.modeling_utils import load_state_dict
from ..utils import (
    _get_model_file,
    is_accelerate_available,
    is_transformers_available,
    logging,
)
from ..utils import _get_model_file, is_accelerate_available, is_transformers_available, logging


if is_transformers_available():

@@ -62,8 +62,6 @@ _REQUIRED_FLEX_VERSION = "2.5.0"
_REQUIRED_XLA_VERSION = "2.2"
_REQUIRED_XFORMERS_VERSION = "0.0.29"

logger = get_logger(__name__)  # pylint: disable=invalid-name

_CAN_USE_FLASH_ATTN = is_flash_attn_available() and is_flash_attn_version(">=", _REQUIRED_FLASH_VERSION)
_CAN_USE_FLASH_ATTN_3 = is_flash_attn_3_available()
_CAN_USE_AITER_ATTN = is_aiter_available() and is_aiter_version(">=", _REQUIRED_AITER_VERSION)
@@ -75,18 +73,8 @@ _CAN_USE_XFORMERS_ATTN = is_xformers_available() and is_xformers_version(">=", _

if _CAN_USE_FLASH_ATTN:
    try:
        from flash_attn import flash_attn_func, flash_attn_varlen_func
        from flash_attn.flash_attn_interface import _wrapped_flash_attn_backward, _wrapped_flash_attn_forward
    except (ImportError, OSError, RuntimeError) as e:
        # Handle ABI mismatch or other import failures gracefully.
        # This can happen when flash_attn was compiled against a different PyTorch version.
        logger.warning(f"flash_attn is installed but failed to import: {e}. Falling back to native PyTorch attention.")
        _CAN_USE_FLASH_ATTN = False
        flash_attn_func = None
        flash_attn_varlen_func = None
        _wrapped_flash_attn_backward = None
        _wrapped_flash_attn_forward = None
    from flash_attn import flash_attn_func, flash_attn_varlen_func
    from flash_attn.flash_attn_interface import _wrapped_flash_attn_backward, _wrapped_flash_attn_forward
else:
    flash_attn_func = None
    flash_attn_varlen_func = None
@@ -95,47 +83,26 @@ else:


if _CAN_USE_FLASH_ATTN_3:
    try:
        from flash_attn_interface import flash_attn_func as flash_attn_3_func
        from flash_attn_interface import flash_attn_varlen_func as flash_attn_3_varlen_func
    except (ImportError, OSError, RuntimeError) as e:
        logger.warning(f"flash_attn_3 failed to import: {e}. Falling back to native attention.")
        _CAN_USE_FLASH_ATTN_3 = False
        flash_attn_3_func = None
        flash_attn_3_varlen_func = None
    from flash_attn_interface import flash_attn_func as flash_attn_3_func
    from flash_attn_interface import flash_attn_varlen_func as flash_attn_3_varlen_func
else:
    flash_attn_3_func = None
    flash_attn_3_varlen_func = None

if _CAN_USE_AITER_ATTN:
    try:
        from aiter import flash_attn_func as aiter_flash_attn_func
    except (ImportError, OSError, RuntimeError) as e:
        logger.warning(f"aiter failed to import: {e}. Falling back to native attention.")
        _CAN_USE_AITER_ATTN = False
        aiter_flash_attn_func = None
    from aiter import flash_attn_func as aiter_flash_attn_func
else:
    aiter_flash_attn_func = None

if _CAN_USE_SAGE_ATTN:
    try:
        from sageattention import (
            sageattn,
            sageattn_qk_int8_pv_fp8_cuda,
            sageattn_qk_int8_pv_fp8_cuda_sm90,
            sageattn_qk_int8_pv_fp16_cuda,
            sageattn_qk_int8_pv_fp16_triton,
            sageattn_varlen,
        )
    except (ImportError, OSError, RuntimeError) as e:
        logger.warning(f"sageattention failed to import: {e}. Falling back to native attention.")
        _CAN_USE_SAGE_ATTN = False
        sageattn = None
        sageattn_qk_int8_pv_fp8_cuda = None
        sageattn_qk_int8_pv_fp8_cuda_sm90 = None
        sageattn_qk_int8_pv_fp16_cuda = None
        sageattn_qk_int8_pv_fp16_triton = None
        sageattn_varlen = None
    from sageattention import (
        sageattn,
        sageattn_qk_int8_pv_fp8_cuda,
        sageattn_qk_int8_pv_fp8_cuda_sm90,
        sageattn_qk_int8_pv_fp16_cuda,
        sageattn_qk_int8_pv_fp16_triton,
        sageattn_varlen,
    )
else:
    sageattn = None
    sageattn_qk_int8_pv_fp16_cuda = None
@@ -146,48 +113,26 @@ else:


if _CAN_USE_FLEX_ATTN:
    try:
        # We cannot import the flex_attention function from the package directly because it is expected (from the
        # pytorch documentation) that the user may compile it. If we import directly, we will not have access to the
        # compiled function.
        import torch.nn.attention.flex_attention as flex_attention
    except (ImportError, OSError, RuntimeError) as e:
        logger.warning(f"flex_attention failed to import: {e}. Falling back to native attention.")
        _CAN_USE_FLEX_ATTN = False
        flex_attention = None
else:
    flex_attention = None
    # We cannot import the flex_attention function from the package directly because it is expected (from the
    # pytorch documentation) that the user may compile it. If we import directly, we will not have access to the
    # compiled function.
    import torch.nn.attention.flex_attention as flex_attention


if _CAN_USE_NPU_ATTN:
    try:
        from torch_npu import npu_fusion_attention
    except (ImportError, OSError, RuntimeError) as e:
        logger.warning(f"torch_npu failed to import: {e}. Falling back to native attention.")
        _CAN_USE_NPU_ATTN = False
        npu_fusion_attention = None
    from torch_npu import npu_fusion_attention
else:
    npu_fusion_attention = None


if _CAN_USE_XLA_ATTN:
    try:
        from torch_xla.experimental.custom_kernel import flash_attention as xla_flash_attention
    except (ImportError, OSError, RuntimeError) as e:
        logger.warning(f"torch_xla failed to import: {e}. Falling back to native attention.")
        _CAN_USE_XLA_ATTN = False
        xla_flash_attention = None
    from torch_xla.experimental.custom_kernel import flash_attention as xla_flash_attention
else:
    xla_flash_attention = None


if _CAN_USE_XFORMERS_ATTN:
    try:
        import xformers.ops as xops
    except (ImportError, OSError, RuntimeError) as e:
        logger.warning(f"xformers failed to import: {e}. Falling back to native attention.")
        _CAN_USE_XFORMERS_ATTN = False
        xops = None
    import xformers.ops as xops
else:
    xops = None

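One side of this hunk wraps each optional import in try/except so that an installed-but-broken wheel degrades gracefully instead of crashing at import time. A generic, self-contained sketch of that pattern (names are illustrative, not the library's API):

```py
import importlib.util

HAS_XFORMERS = importlib.util.find_spec("xformers") is not None

if HAS_XFORMERS:
    try:
        import xformers.ops as xops
    except (ImportError, OSError, RuntimeError) as exc:
        # An installed-but-broken build (e.g. ABI mismatch) degrades to the fallback.
        print(f"xformers failed to import: {exc}")
        HAS_XFORMERS = False
        xops = None
else:
    xops = None
```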
@@ -213,6 +158,8 @@ else:
    _register_fake = register_fake_no_op


logger = get_logger(__name__)  # pylint: disable=invalid-name

# TODO(aryan): Add support for the following:
# - Sage Attention++
# - block sparse, radial and other attention methods
@@ -319,21 +266,13 @@ class _HubKernelConfig:
    function_attr: str
    revision: str | None = None
    kernel_fn: Callable | None = None
    wrapped_forward_attr: str | None = None
    wrapped_backward_attr: str | None = None
    wrapped_forward_fn: Callable | None = None
    wrapped_backward_fn: Callable | None = None


# Registry for hub-based attention kernels
_HUB_KERNELS_REGISTRY: dict["AttentionBackendName", _HubKernelConfig] = {
    # TODO: temporary revision for now. Remove when merged upstream into `main`.
    AttentionBackendName._FLASH_3_HUB: _HubKernelConfig(
        repo_id="kernels-community/flash-attn3",
        function_attr="flash_attn_func",
        revision="fake-ops-return-probs",
        wrapped_forward_attr="flash_attn_interface._flash_attn_forward",
        wrapped_backward_attr="flash_attn_interface._flash_attn_backward",
        repo_id="kernels-community/flash-attn3", function_attr="flash_attn_func", revision="fake-ops-return-probs"
    ),
    AttentionBackendName._FLASH_3_VARLEN_HUB: _HubKernelConfig(
        repo_id="kernels-community/flash-attn3",
@@ -341,11 +280,7 @@ _HUB_KERNELS_REGISTRY: dict["AttentionBackendName", _HubKernelConfig] = {
        # revision="fake-ops-return-probs",
    ),
    AttentionBackendName.FLASH_HUB: _HubKernelConfig(
        repo_id="kernels-community/flash-attn2",
        function_attr="flash_attn_func",
        revision=None,
        wrapped_forward_attr="flash_attn_interface._wrapped_flash_attn_forward",
        wrapped_backward_attr="flash_attn_interface._wrapped_flash_attn_backward",
        repo_id="kernels-community/flash-attn2", function_attr="flash_attn_func", revision=None
    ),
    AttentionBackendName.FLASH_VARLEN_HUB: _HubKernelConfig(
        repo_id="kernels-community/flash-attn2", function_attr="flash_attn_varlen_func", revision=None
@@ -670,39 +605,22 @@ def _flex_attention_causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):


# ===== Helpers for downloading kernels =====
def _resolve_kernel_attr(module, attr_path: str):
    target = module
    for attr in attr_path.split("."):
        if not hasattr(target, attr):
            raise AttributeError(f"Kernel module '{module.__name__}' does not define attribute path '{attr_path}'.")
        target = getattr(target, attr)
    return target


def _maybe_download_kernel_for_backend(backend: AttentionBackendName) -> None:
    if backend not in _HUB_KERNELS_REGISTRY:
        return
    config = _HUB_KERNELS_REGISTRY[backend]

    needs_kernel = config.kernel_fn is None
    needs_wrapped_forward = config.wrapped_forward_attr is not None and config.wrapped_forward_fn is None
    needs_wrapped_backward = config.wrapped_backward_attr is not None and config.wrapped_backward_fn is None

    if not (needs_kernel or needs_wrapped_forward or needs_wrapped_backward):
    if config.kernel_fn is not None:
        return

    try:
        from kernels import get_kernel

        kernel_module = get_kernel(config.repo_id, revision=config.revision)
        if needs_kernel:
            config.kernel_fn = _resolve_kernel_attr(kernel_module, config.function_attr)
        kernel_func = getattr(kernel_module, config.function_attr)

        if needs_wrapped_forward:
            config.wrapped_forward_fn = _resolve_kernel_attr(kernel_module, config.wrapped_forward_attr)

        if needs_wrapped_backward:
            config.wrapped_backward_fn = _resolve_kernel_attr(kernel_module, config.wrapped_backward_attr)
        # Cache the downloaded kernel function in the config object
        config.kernel_fn = kernel_func

    except Exception as e:
        logger.error(f"An error occurred while fetching kernel '{config.repo_id}' from the Hub: {e}")
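A quick standalone illustration of the dotted-path lookup that `_resolve_kernel_attr` performs (here `types.SimpleNamespace` stands in for a downloaded kernel module; the names are illustrative):

```py
import types

def resolve_attr(module, attr_path: str):
    # Walk "a.b.c" one attribute at a time, as _resolve_kernel_attr does.
    target = module
    for attr in attr_path.split("."):
        target = getattr(target, attr)
    return target

fake_kernel = types.SimpleNamespace(
    flash_attn_interface=types.SimpleNamespace(_flash_attn_forward=lambda: "fwd")
)
print(resolve_attr(fake_kernel, "flash_attn_interface._flash_attn_forward")())  # "fwd"
```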
@@ -733,7 +651,7 @@ def _wrapped_flash_attn_3(
) -> tuple[torch.Tensor, torch.Tensor]:
    # Hardcoded for now because pytorch does not support tuple/int type hints
    window_size = (-1, -1)
    result = flash_attn_3_func(
    out, lse, *_ = flash_attn_3_func(
        q=q,
        k=k,
        v=v,
@@ -750,9 +668,7 @@ def _wrapped_flash_attn_3(
        pack_gqa=pack_gqa,
        deterministic=deterministic,
        sm_margin=sm_margin,
        return_attn_probs=True,
    )
    out, lse, *_ = result
    lse = lse.permute(0, 2, 1)
    return out, lse

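The `lse.permute(0, 2, 1)` above converts the log-sum-exp tensor from flash-attn's `(batch, heads, seq)` layout to the `(batch, seq, heads)` layout the callers here expect. A minimal shape-only check (dimensions assumed for illustration):

```py
import torch

lse = torch.randn(2, 8, 128)         # (batch, num_heads, seq_len) as returned by flash-attn
lse_permuted = lse.permute(0, 2, 1)  # -> (batch, seq_len, num_heads)
assert lse_permuted.shape == (2, 128, 8)
```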
@@ -1155,258 +1071,6 @@ def _flash_attention_backward_op(
    return grad_query, grad_key, grad_value


def _flash_attention_hub_forward_op(
    ctx: torch.autograd.function.FunctionCtx,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: torch.Tensor | None = None,
    dropout_p: float = 0.0,
    is_causal: bool = False,
    scale: float | None = None,
    enable_gqa: bool = False,
    return_lse: bool = False,
    _save_ctx: bool = True,
    _parallel_config: "ParallelConfig" | None = None,
):
    if attn_mask is not None:
        raise ValueError("`attn_mask` is not yet supported for flash-attn hub kernels.")
    if enable_gqa:
        raise ValueError("`enable_gqa` is not yet supported for flash-attn hub kernels.")

    config = _HUB_KERNELS_REGISTRY[AttentionBackendName.FLASH_HUB]
    wrapped_forward_fn = config.wrapped_forward_fn
    wrapped_backward_fn = config.wrapped_backward_fn
    if wrapped_forward_fn is None or wrapped_backward_fn is None:
        raise RuntimeError(
            "Flash attention hub kernels must expose `_wrapped_flash_attn_forward` and `_wrapped_flash_attn_backward` "
            "for context parallel execution."
        )

    if scale is None:
        scale = query.shape[-1] ** (-0.5)

    window_size = (-1, -1)
    softcap = 0.0
    alibi_slopes = None
    deterministic = False
    grad_enabled = any(x.requires_grad for x in (query, key, value))

    if grad_enabled or (_parallel_config is not None and _parallel_config.context_parallel_config._world_size > 1):
        dropout_p = dropout_p if dropout_p > 0 else 1e-30

    with torch.set_grad_enabled(grad_enabled):
        out, lse, S_dmask, rng_state = wrapped_forward_fn(
            query,
            key,
            value,
            dropout_p,
            scale,
            is_causal,
            window_size[0],
            window_size[1],
            softcap,
            alibi_slopes,
            return_lse,
        )
        lse = lse.permute(0, 2, 1).contiguous()

    if _save_ctx:
        ctx.save_for_backward(query, key, value, out, lse, rng_state)
        ctx.dropout_p = dropout_p
        ctx.scale = scale
        ctx.is_causal = is_causal
        ctx.window_size = window_size
        ctx.softcap = softcap
        ctx.alibi_slopes = alibi_slopes
        ctx.deterministic = deterministic

    return (out, lse) if return_lse else out


def _flash_attention_hub_backward_op(
    ctx: torch.autograd.function.FunctionCtx,
    grad_out: torch.Tensor,
    *args,
    **kwargs,
):
    config = _HUB_KERNELS_REGISTRY[AttentionBackendName.FLASH_HUB]
    wrapped_backward_fn = config.wrapped_backward_fn
    if wrapped_backward_fn is None:
        raise RuntimeError(
            "Flash attention hub kernels must expose `_wrapped_flash_attn_backward` for context parallel execution."
        )

    query, key, value, out, lse, rng_state = ctx.saved_tensors
    grad_query, grad_key, grad_value = torch.empty_like(query), torch.empty_like(key), torch.empty_like(value)

    _ = wrapped_backward_fn(
        grad_out,
        query,
        key,
        value,
        out,
        lse,
        grad_query,
        grad_key,
        grad_value,
        ctx.dropout_p,
        ctx.scale,
        ctx.is_causal,
        ctx.window_size[0],
        ctx.window_size[1],
        ctx.softcap,
        ctx.alibi_slopes,
        ctx.deterministic,
        rng_state,
    )

    grad_query = grad_query[..., : grad_out.shape[-1]]
    grad_key = grad_key[..., : grad_out.shape[-1]]
    grad_value = grad_value[..., : grad_out.shape[-1]]

    return grad_query, grad_key, grad_value


def _flash_attention_3_hub_forward_op(
    ctx: torch.autograd.function.FunctionCtx,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: torch.Tensor | None = None,
    dropout_p: float = 0.0,
    is_causal: bool = False,
    scale: float | None = None,
    enable_gqa: bool = False,
    return_lse: bool = False,
    _save_ctx: bool = True,
    _parallel_config: "ParallelConfig" | None = None,
    *,
    window_size: tuple[int, int] = (-1, -1),
    softcap: float = 0.0,
    num_splits: int = 1,
    pack_gqa: bool | None = None,
    deterministic: bool = False,
    sm_margin: int = 0,
):
    if attn_mask is not None:
        raise ValueError("`attn_mask` is not yet supported for flash-attn 3 hub kernels.")
    if dropout_p != 0.0:
        raise ValueError("`dropout_p` is not yet supported for flash-attn 3 hub kernels.")
    if enable_gqa:
        raise ValueError("`enable_gqa` is not yet supported for flash-attn 3 hub kernels.")

    config = _HUB_KERNELS_REGISTRY[AttentionBackendName._FLASH_3_HUB]
    wrapped_forward_fn = config.wrapped_forward_fn
    if wrapped_forward_fn is None:
        raise RuntimeError(
            "Flash attention 3 hub kernels must expose `flash_attn_interface._flash_attn_forward` "
            "for context parallel execution."
        )

    if scale is None:
        scale = query.shape[-1] ** (-0.5)

    out, softmax_lse, *_ = wrapped_forward_fn(
        query,
        key,
        value,
        None,
        None,  # k_new, v_new
        None,  # qv
        None,  # out
        None,
        None,
        None,  # cu_seqlens_q/k/k_new
        None,
        None,  # seqused_q/k
        None,
        None,  # max_seqlen_q/k
        None,
        None,
        None,  # page_table, kv_batch_idx, leftpad_k
        None,
        None,
        None,  # rotary_cos/sin, seqlens_rotary
        None,
        None,
        None,  # q_descale, k_descale, v_descale
        scale,
        causal=is_causal,
        window_size_left=window_size[0],
        window_size_right=window_size[1],
        attention_chunk=0,
        softcap=softcap,
        num_splits=num_splits,
        pack_gqa=pack_gqa,
        sm_margin=sm_margin,
    )

    lse = softmax_lse.permute(0, 2, 1).contiguous() if return_lse else None

    if _save_ctx:
        ctx.save_for_backward(query, key, value, out, softmax_lse)
        ctx.scale = scale
        ctx.is_causal = is_causal
        ctx.window_size = window_size
        ctx.softcap = softcap
        ctx.deterministic = deterministic
        ctx.sm_margin = sm_margin

    return (out, lse) if return_lse else out


def _flash_attention_3_hub_backward_op(
    ctx: torch.autograd.function.FunctionCtx,
    grad_out: torch.Tensor,
    *args,
    **kwargs,
):
    config = _HUB_KERNELS_REGISTRY[AttentionBackendName._FLASH_3_HUB]
    wrapped_backward_fn = config.wrapped_backward_fn
    if wrapped_backward_fn is None:
        raise RuntimeError(
            "Flash attention 3 hub kernels must expose `flash_attn_interface._flash_attn_backward` "
            "for context parallel execution."
        )

    query, key, value, out, softmax_lse = ctx.saved_tensors
    grad_query = torch.empty_like(query)
    grad_key = torch.empty_like(key)
    grad_value = torch.empty_like(value)

    wrapped_backward_fn(
        grad_out,
        query,
        key,
        value,
        out,
        softmax_lse,
        None,
        None,  # cu_seqlens_q, cu_seqlens_k
        None,
        None,  # seqused_q, seqused_k
        None,
        None,  # max_seqlen_q, max_seqlen_k
        grad_query,
        grad_key,
        grad_value,
        ctx.scale,
        ctx.is_causal,
        ctx.window_size[0],
        ctx.window_size[1],
        ctx.softcap,
        ctx.deterministic,
        ctx.sm_margin,
    )

    grad_query = grad_query[..., : grad_out.shape[-1]]
    grad_key = grad_key[..., : grad_out.shape[-1]]
    grad_value = grad_value[..., : grad_out.shape[-1]]

    return grad_query, grad_key, grad_value


def _sage_attention_forward_op(
    ctx: torch.autograd.function.FunctionCtx,
    query: torch.Tensor,
@@ -1445,46 +1109,6 @@ def _sage_attention_forward_op(
    return (out, lse) if return_lse else out


def _sage_attention_hub_forward_op(
    ctx: torch.autograd.function.FunctionCtx,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: torch.Tensor | None = None,
    dropout_p: float = 0.0,
    is_causal: bool = False,
    scale: float | None = None,
    enable_gqa: bool = False,
    return_lse: bool = False,
    _save_ctx: bool = True,
    _parallel_config: "ParallelConfig" | None = None,
):
    if attn_mask is not None:
        raise ValueError("`attn_mask` is not yet supported for Sage attention.")
    if dropout_p > 0.0:
        raise ValueError("`dropout_p` is not yet supported for Sage attention.")
    if enable_gqa:
        raise ValueError("`enable_gqa` is not yet supported for Sage attention.")

    func = _HUB_KERNELS_REGISTRY[AttentionBackendName.SAGE_HUB].kernel_fn
    out = func(
        q=query,
        k=key,
        v=value,
        tensor_layout="NHD",
        is_causal=is_causal,
        sm_scale=scale,
        return_lse=return_lse,
    )

    lse = None
    if return_lse:
        out, lse, *_ = out
        lse = lse.permute(0, 2, 1).contiguous()

    return (out, lse) if return_lse else out


def _sage_attention_backward_op(
    ctx: torch.autograd.function.FunctionCtx,
    grad_out: torch.Tensor,
@@ -1493,26 +1117,6 @@ def _sage_attention_backward_op(
    raise NotImplementedError("Backward pass is not implemented for Sage attention.")


def _maybe_modify_attn_mask_npu(query: torch.Tensor, key: torch.Tensor, attn_mask: torch.Tensor | None = None):
    # Skip Attention Mask if all values are 1, `None` mask can speedup the computation
    if attn_mask is not None and torch.all(attn_mask != 0):
        attn_mask = None

    # Reshape Attention Mask: [batch_size, seq_len_k] -> [batch_size, 1, sqe_len_q, seq_len_k]
    # https://www.hiascend.com/document/detail/zh/Pytorch/730/apiref/torchnpuCustomsapi/docs/context/torch_npu-npu_fusion_attention.md
    if (
        attn_mask is not None
        and attn_mask.ndim == 2
        and attn_mask.shape[0] == query.shape[0]
        and attn_mask.shape[1] == key.shape[1]
    ):
        B, Sq, Skv = attn_mask.shape[0], query.shape[1], key.shape[1]
        attn_mask = ~attn_mask.to(torch.bool)
        attn_mask = attn_mask.unsqueeze(1).expand(B, Sq, Skv).unsqueeze(1).contiguous()

    return attn_mask


def _npu_attention_forward_op(
    ctx: torch.autograd.function.FunctionCtx,
    query: torch.Tensor,
@@ -1530,14 +1134,11 @@ def _npu_attention_forward_op(
    if return_lse:
        raise ValueError("NPU attention backend does not support setting `return_lse=True`.")

    attn_mask = _maybe_modify_attn_mask_npu(query, key, attn_mask)

    out = npu_fusion_attention(
        query,
        key,
        value,
        query.size(2),  # num_heads
        atten_mask=attn_mask,
        input_layout="BSND",
        pse=None,
        scale=1.0 / math.sqrt(query.shape[-1]) if scale is None else scale,
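As an aside, the mask reshape in `_maybe_modify_attn_mask_npu` above turns a 2-D padding mask of shape `[batch, seq_len_k]` into the inverted 4-D `[batch, 1, seq_len_q, seq_len_k]` boolean mask that `npu_fusion_attention` expects (True marks positions to mask out). A small shape-only sketch with assumed dimensions:

```py
import torch

B, Sq, Skv = 2, 4, 6
attn_mask = torch.ones(B, Skv, dtype=torch.long)  # 1 = attend, 0 = padded
attn_mask[0, -2:] = 0                             # pad the tail of sample 0

inverted = ~attn_mask.to(torch.bool)              # NPU convention: True = masked out
expanded = inverted.unsqueeze(1).expand(B, Sq, Skv).unsqueeze(1).contiguous()
assert expanded.shape == (B, 1, Sq, Skv)
```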
@@ -2341,7 +1942,7 @@ def _flash_attention(
@_AttentionBackendRegistry.register(
    AttentionBackendName.FLASH_HUB,
    constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape],
    supports_context_parallel=True,
    supports_context_parallel=False,
)
def _flash_attention_hub(
    query: torch.Tensor,
@@ -2359,35 +1960,17 @@ def _flash_attention_hub(
        raise ValueError("`attn_mask` is not supported for flash-attn 2.")

    func = _HUB_KERNELS_REGISTRY[AttentionBackendName.FLASH_HUB].kernel_fn
    if _parallel_config is None:
        out = func(
            q=query,
            k=key,
            v=value,
            dropout_p=dropout_p,
            softmax_scale=scale,
            causal=is_causal,
            return_attn_probs=return_lse,
        )
        if return_lse:
            out, lse, *_ = out
    else:
        out = _templated_context_parallel_attention(
            query,
            key,
            value,
            None,
            dropout_p,
            is_causal,
            scale,
            False,
            return_lse,
            forward_op=_flash_attention_hub_forward_op,
            backward_op=_flash_attention_hub_backward_op,
            _parallel_config=_parallel_config,
        )
        if return_lse:
            out, lse = out
    out = func(
        q=query,
        k=key,
        v=value,
        dropout_p=dropout_p,
        softmax_scale=scale,
        causal=is_causal,
        return_attn_probs=return_lse,
    )
    if return_lse:
        out, lse, *_ = out

    return (out, lse) if return_lse else out

@@ -2534,7 +2117,7 @@ def _flash_attention_3(
|
||||
@_AttentionBackendRegistry.register(
|
||||
AttentionBackendName._FLASH_3_HUB,
|
||||
constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape],
|
||||
supports_context_parallel=True,
|
||||
supports_context_parallel=False,
|
||||
)
|
||||
def _flash_attention_3_hub(
|
||||
query: torch.Tensor,
|
||||
@@ -2549,68 +2132,33 @@ def _flash_attention_3_hub(
|
||||
return_attn_probs: bool = False,
|
||||
_parallel_config: "ParallelConfig" | None = None,
|
||||
) -> torch.Tensor:
|
||||
if _parallel_config:
|
||||
raise NotImplementedError(f"{AttentionBackendName._FLASH_3_HUB.value} is not implemented for parallelism yet.")
|
||||
if attn_mask is not None:
|
||||
raise ValueError("`attn_mask` is not supported for flash-attn 3.")
|
||||
|
||||
func = _HUB_KERNELS_REGISTRY[AttentionBackendName._FLASH_3_HUB].kernel_fn
|
||||
if _parallel_config is None:
|
||||
out = func(
|
||||
q=query,
|
||||
k=key,
|
||||
v=value,
|
||||
softmax_scale=scale,
|
||||
causal=is_causal,
|
||||
qv=None,
|
||||
q_descale=None,
|
||||
k_descale=None,
|
||||
v_descale=None,
|
||||
window_size=window_size,
|
||||
softcap=softcap,
|
||||
num_splits=1,
|
||||
pack_gqa=None,
|
||||
deterministic=deterministic,
|
||||
sm_margin=0,
|
||||
return_attn_probs=return_attn_probs,
|
||||
)
|
||||
return (out[0], out[1]) if return_attn_probs else out
|
||||
|
||||
forward_op = functools.partial(
|
||||
_flash_attention_3_hub_forward_op,
|
||||
out = func(
|
||||
q=query,
|
||||
k=key,
|
||||
v=value,
|
||||
softmax_scale=scale,
|
||||
causal=is_causal,
|
||||
qv=None,
|
||||
q_descale=None,
|
||||
k_descale=None,
|
||||
v_descale=None,
|
||||
window_size=window_size,
|
||||
softcap=softcap,
|
||||
num_splits=1,
|
||||
pack_gqa=None,
|
||||
deterministic=deterministic,
|
||||
sm_margin=0,
|
||||
return_attn_probs=return_attn_probs,
|
||||
)
|
||||
backward_op = functools.partial(
|
||||
_flash_attention_3_hub_backward_op,
|
||||
window_size=window_size,
|
||||
softcap=softcap,
|
||||
num_splits=1,
|
||||
pack_gqa=None,
|
||||
deterministic=deterministic,
|
||||
sm_margin=0,
|
||||
)
|
||||
out = _templated_context_parallel_attention(
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
None,
|
||||
0.0,
|
||||
is_causal,
|
||||
scale,
|
||||
False,
|
||||
return_attn_probs,
|
||||
forward_op=forward_op,
|
||||
backward_op=backward_op,
|
||||
_parallel_config=_parallel_config,
|
||||
)
|
||||
if return_attn_probs:
|
||||
out, lse = out
|
||||
return out, lse
|
||||
|
||||
return out
|
||||
# When `return_attn_probs` is True, the above returns a tuple of
|
||||
# actual outputs and lse.
|
||||
return (out[0], out[1]) if return_attn_probs else out
|
||||
|
||||
|
||||
@_AttentionBackendRegistry.register(
|
||||
@@ -2703,7 +2251,7 @@ def _flash_varlen_attention_3(
|
||||
key_packed = torch.cat(key_valid, dim=0)
|
||||
value_packed = torch.cat(value_valid, dim=0)
|
||||
|
||||
result = flash_attn_3_varlen_func(
|
||||
out, lse, *_ = flash_attn_3_varlen_func(
|
||||
q=query_packed,
|
||||
k=key_packed,
|
||||
v=value_packed,
|
||||
@@ -2713,13 +2261,7 @@ def _flash_varlen_attention_3(
|
||||
max_seqlen_k=max_seqlen_k,
|
||||
softmax_scale=scale,
|
||||
causal=is_causal,
|
||||
return_attn_probs=return_lse,
|
||||
)
|
||||
if isinstance(result, tuple):
|
||||
out, lse, *_ = result
|
||||
else:
|
||||
out = result
|
||||
lse = None
|
||||
out = out.unflatten(0, (batch_size, -1))
|
||||
|
||||
return (out, lse) if return_lse else out
|
||||
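As an aside, the `out, lse, *_ = ...` form only works if the kernel always returns a tuple; a minimal illustration with a stand-in function (the return values here are hypothetical):

```python
def fake_varlen_attention(return_attn_probs: bool):
    # Stand-in for flash_attn_3_varlen_func: assuming the kernel returns a
    # tuple (out, lse, ...) regardless of the flag, starred unpacking is safe
    # and the isinstance() normalization becomes unnecessary.
    out, lse = "out", "lse"
    return (out, lse, "extra") if return_attn_probs else (out, lse)

out, lse, *_ = fake_varlen_attention(return_attn_probs=False)
assert (out, lse) == ("out", "lse")
```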
@@ -3126,17 +2668,16 @@ def _native_npu_attention(
    return_lse: bool = False,
    _parallel_config: "ParallelConfig" | None = None,
) -> torch.Tensor:
    if attn_mask is not None:
        raise ValueError("`attn_mask` is not supported for NPU attention")
    if return_lse:
        raise ValueError("NPU attention backend does not support setting `return_lse=True`.")
    if _parallel_config is None:
        attn_mask = _maybe_modify_attn_mask_npu(query, key, attn_mask)

        out = npu_fusion_attention(
            query,
            key,
            value,
            query.size(2),  # num_heads
            atten_mask=attn_mask,
            input_layout="BSND",
            pse=None,
            scale=1.0 / math.sqrt(query.shape[-1]) if scale is None else scale,
@@ -3151,7 +2692,7 @@ def _native_npu_attention(
            query,
            key,
            value,
            attn_mask,
            None,
            dropout_p,
            None,
            scale,
@@ -3248,7 +2789,7 @@ def _sage_attention(
@_AttentionBackendRegistry.register(
    AttentionBackendName.SAGE_HUB,
    constraints=[_check_device_cuda, _check_qkv_dtype_bf16_or_fp16, _check_shape],
    supports_context_parallel=True,
    supports_context_parallel=False,
)
def _sage_attention_hub(
    query: torch.Tensor,
@@ -3276,23 +2817,6 @@ def _sage_attention_hub(
    )
    if return_lse:
        out, lse, *_ = out
    else:
        out = _templated_context_parallel_attention(
            query,
            key,
            value,
            None,
            0.0,
            is_causal,
            scale,
            False,
            return_lse,
            forward_op=_sage_attention_hub_forward_op,
            backward_op=_sage_attention_backward_op,
            _parallel_config=_parallel_config,
        )
        if return_lse:
            out, lse = out

    return (out, lse) if return_lse else out

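For orientation, the `register(...)` decorators above follow roughly this shape; the sketch below uses a made-up registry and backend name, not the actual diffusers dispatch module:

```python
registry = {}

def register(name, constraints=(), supports_context_parallel=False):
    # Illustrative only: a backend is stored with its constraint callables
    # and a context-parallel capability flag, mirroring the calls above.
    def decorator(fn):
        registry[name] = {
            "fn": fn,
            "constraints": list(constraints),
            "supports_context_parallel": supports_context_parallel,
        }
        return fn
    return decorator

@register("my_backend", constraints=[lambda q, k, v: q.dtype == k.dtype])
def my_backend(q, k, v):
    return q  # placeholder
```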
@@ -30,126 +30,10 @@ class AutoModel(ConfigMixin):
    def __init__(self, *args, **kwargs):
        raise EnvironmentError(
            f"{self.__class__.__name__} is designed to be instantiated "
            f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)`, "
            f"`{self.__class__.__name__}.from_config(config)`, or "
            f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or "
            f"`{self.__class__.__name__}.from_pipe(pipeline)` methods."
        )

    @classmethod
    def from_config(cls, pretrained_model_name_or_path_or_dict: str | os.PathLike | dict | None = None, **kwargs):
        r"""
        Instantiate a model from a config dictionary or a pretrained model configuration file with random weights (no
        pretrained weights are loaded).

        Parameters:
            pretrained_model_name_or_path_or_dict (`str`, `os.PathLike`, or `dict`):
                Can be either:

                    - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model
                      configuration hosted on the Hub.
                    - A path to a *directory* (for example `./my_model_directory`) containing a model configuration
                      file.
                    - A config dictionary.

            cache_dir (`Union[str, os.PathLike]`, *optional*):
                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
                is not used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model configuration, overriding the cached version if
                it exists.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint.
            local_files_only(`bool`, *optional*, defaults to `False`):
                Whether to only load local model configuration files or not.
            token (`str` or *bool*, *optional*):
                The token to use as HTTP bearer authorization for remote files.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use.
            trust_remote_code (`bool`, *optional*, defaults to `False`):
                Whether to trust remote code.
            subfolder (`str`, *optional*, defaults to `""`):
                The subfolder location of a model file within a larger model repository on the Hub or locally.

        Returns:
            A model object instantiated from the config with random weights.

        Example:

        ```py
        from diffusers import AutoModel

        model = AutoModel.from_config("stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="unet")
        ```
        """
        subfolder = kwargs.pop("subfolder", None)
        trust_remote_code = kwargs.pop("trust_remote_code", False)

        hub_kwargs_names = [
            "cache_dir",
            "force_download",
            "local_files_only",
            "proxies",
            "revision",
            "token",
        ]
        hub_kwargs = {name: kwargs.pop(name, None) for name in hub_kwargs_names}

        if pretrained_model_name_or_path_or_dict is None:
            raise ValueError(
                "Please provide a `pretrained_model_name_or_path_or_dict` as the first positional argument."
            )

        if isinstance(pretrained_model_name_or_path_or_dict, (str, os.PathLike)):
            pretrained_model_name_or_path = pretrained_model_name_or_path_or_dict
            config = cls.load_config(pretrained_model_name_or_path, subfolder=subfolder, **hub_kwargs)
        else:
            config = pretrained_model_name_or_path_or_dict
            pretrained_model_name_or_path = config.get("_name_or_path", None)

        has_remote_code = "auto_map" in config and cls.__name__ in config["auto_map"]
        trust_remote_code = resolve_trust_remote_code(
            trust_remote_code, pretrained_model_name_or_path, has_remote_code
        )

        if has_remote_code and trust_remote_code:
            class_ref = config["auto_map"][cls.__name__]
            module_file, class_name = class_ref.split(".")
            module_file = module_file + ".py"
            model_cls = get_class_from_dynamic_module(
                pretrained_model_name_or_path,
                subfolder=subfolder,
                module_file=module_file,
                class_name=class_name,
                **hub_kwargs,
            )
        else:
            if "_class_name" in config:
                class_name = config["_class_name"]
                library = "diffusers"
            elif "model_type" in config:
                class_name = "AutoModel"
                library = "transformers"
            else:
                raise ValueError(
                    f"Couldn't find a model class associated with the config: {config}. Make sure the config "
                    "contains a `_class_name` or `model_type` key."
                )

            from ..pipelines.pipeline_loading_utils import ALL_IMPORTABLE_CLASSES, get_class_obj_and_candidates

            model_cls, _ = get_class_obj_and_candidates(
                library_name=library,
                class_name=class_name,
                importable_classes=ALL_IMPORTABLE_CLASSES,
                pipelines=None,
                is_pipeline_module=False,
            )

        if model_cls is None:
            raise ValueError(f"AutoModel can't find a model linked to {class_name}.")

        return model_cls.from_config(config, **kwargs)

    @classmethod
    @validate_hf_hub_args
    def from_pretrained(cls, pretrained_model_or_path: str | os.PathLike | None = None, **kwargs):

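The `_class_name` / `model_type` dispatch above is easy to exercise in isolation; a small sketch of just that rule (class names here are illustrative):

```python
# Sketch of the dispatch rule in AutoModel.from_config: a diffusers config
# carries `_class_name`, while a transformers config carries `model_type`.
def resolve_library(config: dict) -> tuple[str, str]:
    if "_class_name" in config:
        return "diffusers", config["_class_name"]
    if "model_type" in config:
        return "transformers", "AutoModel"
    raise ValueError("config needs a `_class_name` or `model_type` key")

assert resolve_library({"_class_name": "UNet2DConditionModel"}) == ("diffusers", "UNet2DConditionModel")
assert resolve_library({"model_type": "clip_text_model"}) == ("transformers", "AutoModel")
```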
@@ -191,12 +191,7 @@ class CosmosControlNetModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
            dim=1,
        )

        if condition_mask is not None:
            control_hidden_states = torch.cat([control_hidden_states, condition_mask], dim=1)
        else:
            control_hidden_states = torch.cat(
                [control_hidden_states, torch.zeros_like(controls_latents[:, :1])], dim=1
            )
        control_hidden_states = torch.cat([control_hidden_states, torch.zeros_like(controls_latents[:, :1])], dim=1)

        padding_mask_resized = transforms.functional.resize(
            padding_mask, list(control_hidden_states.shape[-2:]), interpolation=transforms.InterpolationMode.NEAREST

@@ -424,7 +424,7 @@ class Flux2SingleTransformerBlock(nn.Module):
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor | None,
        temb_mod: torch.Tensor,
        temb_mod_params: tuple[torch.Tensor, torch.Tensor, torch.Tensor],
        image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
        joint_attention_kwargs: dict[str, Any] | None = None,
        split_hidden_states: bool = False,
@@ -436,7 +436,7 @@ class Flux2SingleTransformerBlock(nn.Module):
            text_seq_len = encoder_hidden_states.shape[1]
            hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)

        mod_shift, mod_scale, mod_gate = Flux2Modulation.split(temb_mod, 1)[0]
        mod_shift, mod_scale, mod_gate = temb_mod_params

        norm_hidden_states = self.norm(hidden_states)
        norm_hidden_states = (1 + mod_scale) * norm_hidden_states + mod_shift
@@ -498,18 +498,16 @@ class Flux2TransformerBlock(nn.Module):
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        temb_mod_img: torch.Tensor,
        temb_mod_txt: torch.Tensor,
        temb_mod_params_img: tuple[tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...],
        temb_mod_params_txt: tuple[tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...],
        image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
        joint_attention_kwargs: dict[str, Any] | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        joint_attention_kwargs = joint_attention_kwargs or {}

        # Modulation parameters shape: [1, 1, self.dim]
        (shift_msa, scale_msa, gate_msa), (shift_mlp, scale_mlp, gate_mlp) = Flux2Modulation.split(temb_mod_img, 2)
        (c_shift_msa, c_scale_msa, c_gate_msa), (c_shift_mlp, c_scale_mlp, c_gate_mlp) = Flux2Modulation.split(
            temb_mod_txt, 2
        )
        (shift_msa, scale_msa, gate_msa), (shift_mlp, scale_mlp, gate_mlp) = temb_mod_params_img
        (c_shift_msa, c_scale_msa, c_gate_msa), (c_shift_mlp, c_scale_mlp, c_gate_mlp) = temb_mod_params_txt

        # Img stream
        norm_hidden_states = self.norm1(hidden_states)
@@ -629,19 +627,15 @@ class Flux2Modulation(nn.Module):
        self.linear = nn.Linear(dim, dim * 3 * self.mod_param_sets, bias=bias)
        self.act_fn = nn.SiLU()

    def forward(self, temb: torch.Tensor) -> torch.Tensor:
    def forward(self, temb: torch.Tensor) -> tuple[tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...]:
        mod = self.act_fn(temb)
        mod = self.linear(mod)
        return mod

    @staticmethod
    # split inside the transformer blocks, to avoid passing tuples into checkpoints https://github.com/huggingface/diffusers/issues/12776
    def split(mod: torch.Tensor, mod_param_sets: int) -> tuple[tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...]:
        if mod.ndim == 2:
            mod = mod.unsqueeze(1)
        mod_params = torch.chunk(mod, 3 * mod_param_sets, dim=-1)
        mod_params = torch.chunk(mod, 3 * self.mod_param_sets, dim=-1)
        # Return tuple of 3-tuples of modulation params shift/scale/gate
        return tuple(mod_params[3 * i : 3 * (i + 1)] for i in range(mod_param_sets))
        return tuple(mod_params[3 * i : 3 * (i + 1)] for i in range(self.mod_param_sets))

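A quick numeric check of the chunk-then-group split logic, standalone and with small made-up dimensions:

```python
import torch

def split(mod: torch.Tensor, mod_param_sets: int):
    # Mirrors Flux2Modulation.split: chunk the last dim into 3 * n pieces,
    # then group them into n (shift, scale, gate) triples.
    if mod.ndim == 2:
        mod = mod.unsqueeze(1)
    mod_params = torch.chunk(mod, 3 * mod_param_sets, dim=-1)
    return tuple(mod_params[3 * i : 3 * (i + 1)] for i in range(mod_param_sets))

temb_mod = torch.randn(1, 12)  # dim = 2 sets * 3 params * 2 channels
(shift0, scale0, gate0), (shift1, scale1, gate1) = split(temb_mod, 2)
print(shift0.shape)  # torch.Size([1, 1, 2])
```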
class Flux2Transformer2DModel(
@@ -830,7 +824,7 @@ class Flux2Transformer2DModel(

        double_stream_mod_img = self.double_stream_modulation_img(temb)
        double_stream_mod_txt = self.double_stream_modulation_txt(temb)
        single_stream_mod = self.single_stream_modulation(temb)
        single_stream_mod = self.single_stream_modulation(temb)[0]

        # 2. Input projection for image (hidden_states) and conditioning text (encoder_hidden_states)
        hidden_states = self.x_embedder(hidden_states)
@@ -867,8 +861,8 @@ class Flux2Transformer2DModel(
                encoder_hidden_states, hidden_states = block(
                    hidden_states=hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    temb_mod_img=double_stream_mod_img,
                    temb_mod_txt=double_stream_mod_txt,
                    temb_mod_params_img=double_stream_mod_img,
                    temb_mod_params_txt=double_stream_mod_txt,
                    image_rotary_emb=concat_rotary_emb,
                    joint_attention_kwargs=joint_attention_kwargs,
                )
@@ -890,7 +884,7 @@ class Flux2Transformer2DModel(
                hidden_states = block(
                    hidden_states=hidden_states,
                    encoder_hidden_states=None,
                    temb_mod=single_stream_mod,
                    temb_mod_params=single_stream_mod,
                    image_rotary_emb=concat_rotary_emb,
                    joint_attention_kwargs=joint_attention_kwargs,
                )

@@ -164,11 +164,7 @@ def compute_text_seq_len_from_mask(
    position_ids = torch.arange(text_seq_len, device=encoder_hidden_states.device, dtype=torch.long)
    active_positions = torch.where(encoder_hidden_states_mask, position_ids, position_ids.new_zeros(()))
    has_active = encoder_hidden_states_mask.any(dim=1)
    per_sample_len = torch.where(
        has_active,
        active_positions.max(dim=1).values + 1,
        torch.as_tensor(text_seq_len, device=encoder_hidden_states.device),
    )
    per_sample_len = torch.where(has_active, active_positions.max(dim=1).values + 1, torch.as_tensor(text_seq_len))
    return text_seq_len, per_sample_len, encoder_hidden_states_mask

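For intuition, the per-sample length computation on a toy mask (self-contained):

```python
import torch

mask = torch.tensor([[True, True, False, False],
                     [False, False, False, False]])  # [batch, text_seq_len]
text_seq_len = mask.shape[1]

position_ids = torch.arange(text_seq_len)
active_positions = torch.where(mask, position_ids, position_ids.new_zeros(()))
has_active = mask.any(dim=1)
# Rows with no active tokens fall back to the full sequence length.
per_sample_len = torch.where(has_active, active_positions.max(dim=1).values + 1, torch.as_tensor(text_seq_len))
print(per_sample_len)  # tensor([2, 4])
```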
@@ -14,7 +14,6 @@
import importlib
import inspect
import os
import sys
import traceback
import warnings
from collections import OrderedDict
@@ -29,16 +28,10 @@ from tqdm.auto import tqdm
from typing_extensions import Self

from ..configuration_utils import ConfigMixin, FrozenDict
from ..pipelines.pipeline_loading_utils import (
    LOADABLE_CLASSES,
    _fetch_class_library_tuple,
    _unwrap_model,
    simple_get_class_obj,
)
from ..pipelines.pipeline_loading_utils import _fetch_class_library_tuple, simple_get_class_obj
from ..utils import PushToHubMixin, is_accelerate_available, logging
from ..utils.dynamic_modules_utils import get_class_from_dynamic_module, resolve_trust_remote_code
from ..utils.hub_utils import load_or_create_model_card, populate_model_card
from ..utils.torch_utils import is_compiled_module
from .components_manager import ComponentsManager
from .modular_pipeline_utils import (
    MODULAR_MODEL_CARD_TEMPLATE,
@@ -1640,14 +1633,7 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
        blocks_class_name = self.default_blocks_name
        if blocks_class_name is not None:
            diffusers_module = importlib.import_module("diffusers")
            blocks_class = getattr(diffusers_module, blocks_class_name, None)
            # If the blocks_class is not found or is a base class (e.g. SequentialPipelineBlocks saved by from_blocks_dict) with empty block_classes,
            # fall back to default_blocks_name
            if blocks_class is None or not blocks_class.block_classes:
                blocks_class_name = self.default_blocks_name
                blocks_class = getattr(diffusers_module, blocks_class_name)

        if blocks_class is not None:
            blocks_class = getattr(diffusers_module, blocks_class_name)
            blocks = blocks_class()
        else:
            logger.warning(f"`blocks` is `None`, no default blocks class found for {self.__class__.__name__}")
@@ -1707,8 +1693,6 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
            _blocks_class_name=self._blocks.__class__.__name__ if self._blocks is not None else None
        )

        self._pretrained_model_name_or_path = pretrained_model_name_or_path

    @property
    def default_call_parameters(self) -> dict[str, Any]:
        """
@@ -1835,136 +1819,44 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
        )
        return pipeline

    def save_pretrained(
        self,
        save_directory: str | os.PathLike,
        safe_serialization: bool = True,
        variant: str | None = None,
        max_shard_size: int | str | None = None,
        push_to_hub: bool = False,
        **kwargs,
    ):
    def save_pretrained(self, save_directory: str | os.PathLike, push_to_hub: bool = False, **kwargs):
        """
        Save the pipeline and all its components to a directory, so that it can be re-loaded using the
        [`~ModularPipeline.from_pretrained`] class method.
        Save the pipeline to a directory. It does not save components; you need to save them separately.

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory to save the pipeline to. Will be created if it doesn't exist.
            safe_serialization (`bool`, *optional*, defaults to `True`):
                Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`.
            variant (`str`, *optional*):
                If specified, weights are saved in the format `pytorch_model.<variant>.bin`.
            max_shard_size (`int` or `str`, defaults to `None`):
                The maximum size for a checkpoint before being sharded. Each checkpoint shard will then be of a size
                lower than this size. If expressed as a string, it needs to be digits followed by a unit (like
                `"5GB"`). If expressed as an integer, the unit is bytes.
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether to push the pipeline to the Hugging Face model hub after saving it.
            **kwargs: Additional keyword arguments:
                - `overwrite_modular_index` (`bool`, *optional*, defaults to `False`):
                    When saving a Modular Pipeline, its components in `modular_model_index.json` may reference repos
                    different from the destination repo. Setting this to `True` updates all component references in
                    `modular_model_index.json` so they point to the repo specified by `repo_id`.
                - `repo_id` (`str`, *optional*):
                    The repository ID to push the pipeline to. Defaults to the last component of `save_directory`.
                - `commit_message` (`str`, *optional*):
                    Commit message for the push to hub operation.
                - `private` (`bool`, *optional*):
                    Whether the repository should be private.
                - `create_pr` (`bool`, *optional*, defaults to `False`):
                    Whether to create a pull request instead of pushing directly.
                - `token` (`str`, *optional*):
                    The Hugging Face token to use for authentication.
                Path to the directory where the pipeline will be saved.
            push_to_hub (`bool`, *optional*):
                Whether to push the pipeline to the Hugging Face Hub.
            **kwargs: Additional arguments passed to the `save_config()` method
        """
        overwrite_modular_index = kwargs.pop("overwrite_modular_index", False)
        repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])

        if push_to_hub:
            commit_message = kwargs.pop("commit_message", None)
            private = kwargs.pop("private", None)
            create_pr = kwargs.pop("create_pr", False)
            token = kwargs.pop("token", None)
            update_model_card = kwargs.pop("update_model_card", False)
            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
            repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id

        for component_name, component_spec in self._component_specs.items():
            if component_spec.default_creation_method != "from_pretrained":
                continue

            component = getattr(self, component_name, None)
            if component is None:
                continue

            model_cls = component.__class__
            if is_compiled_module(component):
                component = _unwrap_model(component)
                model_cls = component.__class__

            save_method_name = None
            for library_name, library_classes in LOADABLE_CLASSES.items():
                if library_name in sys.modules:
                    library = importlib.import_module(library_name)
                else:
                    logger.info(
                        f"{library_name} is not installed. Cannot save {component_name} as {library_classes} from {library_name}"
                    )
                    continue

                for base_class, save_load_methods in library_classes.items():
                    class_candidate = getattr(library, base_class, None)
                    if class_candidate is not None and issubclass(model_cls, class_candidate):
                        save_method_name = save_load_methods[0]
                        break
                if save_method_name is not None:
                    break

            if save_method_name is None:
                logger.warning(f"self.{component_name}={component} of type {type(component)} cannot be saved.")
                continue

            save_method = getattr(component, save_method_name)
            save_method_signature = inspect.signature(save_method)
            save_method_accept_safe = "safe_serialization" in save_method_signature.parameters
            save_method_accept_variant = "variant" in save_method_signature.parameters
            save_method_accept_max_shard_size = "max_shard_size" in save_method_signature.parameters

            save_kwargs = {}
            if save_method_accept_safe:
                save_kwargs["safe_serialization"] = safe_serialization
            if save_method_accept_variant:
                save_kwargs["variant"] = variant
            if save_method_accept_max_shard_size and max_shard_size is not None:
                save_kwargs["max_shard_size"] = max_shard_size

            component_save_path = os.path.join(save_directory, component_name)
            save_method(component_save_path, **save_kwargs)

            if component_name not in self.config:
                continue

            has_no_load_id = not hasattr(component, "_diffusers_load_id") or component._diffusers_load_id == "null"
            if overwrite_modular_index or has_no_load_id:
                library, class_name, component_spec_dict = self.config[component_name]
                component_spec_dict["pretrained_model_name_or_path"] = repo_id if push_to_hub else save_directory
                component_spec_dict["subfolder"] = component_name
                self.register_to_config(**{component_name: (library, class_name, component_spec_dict)})

        self.save_config(save_directory=save_directory)

        if push_to_hub:
            # Generate modular pipeline card content
            card_content = generate_modular_model_card_content(self.blocks)

            # Create a new empty model card and eventually tag it
            model_card = load_or_create_model_card(
                repo_id,
                token=token,
                is_pipeline=True,
                model_description=MODULAR_MODEL_CARD_TEMPLATE.format(**card_content),
                is_modular=True,
                update_model_card=update_model_card,
            )
            model_card = populate_model_card(model_card, tags=card_content["tags"])

            model_card.save(os.path.join(save_directory, "README.md"))

        # YiYi TODO: maybe order the json file to make it more readable: configs first, then components
        self.save_config(save_directory=save_directory)

        if push_to_hub:
            self._upload_folder(
                save_directory,
                repo_id,
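The save-method lookup in the removed code walks a `LOADABLE_CLASSES`-style mapping; a condensed sketch of that resolution (the mapping contents here are illustrative, not the real table):

```python
import importlib
import sys

# Illustrative subset: library -> {base class name: (save_method, load_method)}
LOADABLE = {
    "diffusers": {"ModelMixin": ("save_pretrained", "from_pretrained")},
    "transformers": {"PreTrainedModel": ("save_pretrained", "from_pretrained")},
}

def find_save_method_name(model_cls) -> str | None:
    for library_name, library_classes in LOADABLE.items():
        if library_name not in sys.modules:
            continue  # skip libraries that are not installed/imported
        library = importlib.import_module(library_name)
        for base_class, save_load in library_classes.items():
            candidate = getattr(library, base_class, None)
            if candidate is not None and issubclass(model_cls, candidate):
                return save_load[0]  # first entry is the save method name
    return None
```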
@@ -2232,9 +2124,8 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
        ```

        Notes:
            - Components loaded with `AutoModel.from_pretrained()` or `ComponentSpec.load()` will have
              loading specs preserved for serialization. Custom or locally loaded components without Hub references will
              have their `modular_model_index.json` entries updated automatically during `save_pretrained()`.
            - Components with trained weights should be loaded with `AutoModel.from_pretrained()` or
              `ComponentSpec.load()` so that loading specs are preserved for serialization.
            - ConfigMixin objects without weights (e.g., schedulers, guiders) can be passed directly.
        """

@@ -2256,10 +2147,13 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
            new_component_spec = current_component_spec
            if hasattr(self, name) and getattr(self, name) is not None:
                logger.warning(f"ModularPipeline.update_components: setting {name} to None (spec unchanged)")
            elif (
                current_component_spec.default_creation_method == "from_pretrained"
                and getattr(component, "_diffusers_load_id", None) is None
            elif current_component_spec.default_creation_method == "from_pretrained" and not (
                hasattr(component, "_diffusers_load_id") and component._diffusers_load_id is not None
            ):
                logger.warning(
                    f"ModularPipeline.update_components: {name} has no valid _diffusers_load_id. "
                    f"This will result in an empty loading spec; use ComponentSpec.load() for proper specs"
                )
                new_component_spec = ComponentSpec(name=name, type_hint=type(component))
            else:
                new_component_spec = ComponentSpec.from_component(name, component)
@@ -2332,49 +2226,17 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
                elif "default" in value:
                    # check if the default is specified
                    component_load_kwargs[key] = value["default"]
            # Only pass trust_remote_code to components from the same repo as the pipeline.
            # When a user passes trust_remote_code=True, they intend to trust code from the
            # pipeline's repo, not from external repos referenced in modular_model_index.json.
            trust_remote_code_stripped = False
            if (
                "trust_remote_code" in component_load_kwargs
                and self._pretrained_model_name_or_path is not None
                and spec.pretrained_model_name_or_path != self._pretrained_model_name_or_path
            ):
                component_load_kwargs.pop("trust_remote_code")
                trust_remote_code_stripped = True

            if not spec.pretrained_model_name_or_path:
                logger.info(f"Skipping component `{name}`: no pretrained model path specified.")
                continue

            try:
                components_to_register[name] = spec.load(**component_load_kwargs)
            except Exception:
                tb = traceback.format_exc()
                if trust_remote_code_stripped and "trust_remote_code" in tb:
                    warning_msg = (
                        f"Failed to load component `{name}` from external repository "
                        f"`{spec.pretrained_model_name_or_path}`.\n\n"
                        f"`trust_remote_code=True` was not forwarded to `{name}` because it comes from "
                        f"a different repository than the pipeline (`{self._pretrained_model_name_or_path}`). "
                        f"For safety, `trust_remote_code` is only forwarded to components from the same "
                        f"repository as the pipeline.\n\n"
                        f"You need to load this component manually with `trust_remote_code=True` and pass it "
                        f"to the pipeline via `pipe.update_components()`. For example, if it is a custom model:\n\n"
                        f'    {name} = AutoModel.from_pretrained("{spec.pretrained_model_name_or_path}", trust_remote_code=True)\n'
                        f"    pipe.update_components({name}={name})\n"
                    )
                else:
                    warning_msg = (
                        f"Failed to create component {name}:\n"
                        f"- Component spec: {spec}\n"
                        f"- load() called with kwargs: {component_load_kwargs}\n"
                        "If this component is not required for your workflow you can safely ignore this message.\n\n"
                        "Traceback:\n"
                        f"{tb}"
                    )
                logger.warning(warning_msg)
                logger.warning(
                    f"\nFailed to create component {name}:\n"
                    f"- Component spec: {spec}\n"
                    f"- load() called with kwargs: {component_load_kwargs}\n"
                    "If this component is not required for your workflow you can safely ignore this message.\n\n"
                    "Traceback:\n"
                    f"{traceback.format_exc()}"
                )

        # Register all components at once
        self.register_components(**components_to_register)

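A compact sketch of the `trust_remote_code` gating rule used in the removed branch (the repo ids below are illustrative):

```python
def strip_external_trust(load_kwargs: dict, pipeline_repo: str | None, component_repo: str) -> bool:
    # Mirrors the guard above: only forward trust_remote_code to components
    # that live in the same repo the pipeline itself was loaded from.
    if "trust_remote_code" in load_kwargs and pipeline_repo is not None and component_repo != pipeline_repo:
        load_kwargs.pop("trust_remote_code")
        return True
    return False

kwargs = {"trust_remote_code": True}
assert strip_external_trust(kwargs, "org/pipeline-repo", "other-org/custom-model") is True
assert "trust_remote_code" not in kwargs
```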
@@ -50,7 +50,11 @@ This modular pipeline is composed of the following blocks:

{components_description} {configs_section}

{io_specification_section}
## Input/Output Specification

### Inputs {inputs_description}

### Outputs {outputs_description}
"""


@@ -307,12 +311,6 @@ class ComponentSpec:
                f"`type_hint` is required when loading a single file model but is missing for component: {self.name}"
            )

        # `torch_dtype` is not an accepted parameter for tokenizers and processors.
        # As a result, it gets stored in `init_kwargs`, which are written to the config
        # during save. This causes JSON serialization to fail when saving the component.
        if self.type_hint is not None and not issubclass(self.type_hint, torch.nn.Module):
            kwargs.pop("torch_dtype", None)

        if self.type_hint is None:
            try:
                from diffusers import AutoModel
@@ -330,12 +328,6 @@ class ComponentSpec:
            else getattr(self.type_hint, "from_pretrained")
        )

        # `torch_dtype` is not an accepted parameter for tokenizers and processors.
        # As a result, it gets stored in `init_kwargs`, which are written to the config
        # during save. This causes JSON serialization to fail when saving the component.
        if not issubclass(self.type_hint, torch.nn.Module):
            kwargs.pop("torch_dtype", None)

        try:
            component = load_method(pretrained_model_name_or_path, **load_kwargs, **kwargs)
        except Exception as e:
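The `torch_dtype` filter above is easy to reproduce in isolation; a minimal sketch in which `FakeTokenizer` is a made-up stand-in:

```python
import torch

class FakeTokenizer:  # stand-in for a non-nn.Module component type
    pass

def filter_kwargs(type_hint, kwargs: dict) -> dict:
    # torch_dtype only makes sense for torch.nn.Module subclasses; tokenizers
    # and processors would stash it in init_kwargs and break JSON serialization.
    if type_hint is not None and not issubclass(type_hint, torch.nn.Module):
        kwargs.pop("torch_dtype", None)
    return kwargs

print(filter_kwargs(FakeTokenizer, {"torch_dtype": torch.float16, "use_fast": True}))
# {'use_fast': True}
```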
@@ -807,46 +799,6 @@ def format_output_params(output_params, indent_level=4, max_line_length=115):
    return format_params(output_params, "Outputs", indent_level, max_line_length)


def format_params_markdown(params, header="Inputs"):
    """Format a list of InputParam or OutputParam objects as a markdown bullet-point list.

    Suitable for model cards rendered on Hugging Face Hub.

    Args:
        params: list of InputParam or OutputParam objects to format
        header: Header text (e.g. "Inputs" or "Outputs")

    Returns:
        A formatted markdown string, or empty string if params is empty.
    """
    if not params:
        return ""

    def get_type_str(type_hint):
        if isinstance(type_hint, UnionType) or get_origin(type_hint) is Union:
            type_strs = [t.__name__ if hasattr(t, "__name__") else str(t) for t in get_args(type_hint)]
            return " | ".join(type_strs)
        return type_hint.__name__ if hasattr(type_hint, "__name__") else str(type_hint)

    lines = [f"**{header}:**\n"] if header else []
    for param in params:
        type_str = get_type_str(param.type_hint) if param.type_hint != Any else ""
        name = f"**{param.kwargs_type}" if param.name is None and param.kwargs_type is not None else param.name
        param_str = f"- `{name}` (`{type_str}`"

        if hasattr(param, "required") and not param.required:
            param_str += ", *optional*"
            if param.default is not None:
                param_str += f", defaults to `{param.default}`"
        param_str += ")"

        desc = param.description if param.description else "No description provided"
        param_str += f": {desc}"
        lines.append(param_str)

    return "\n".join(lines)


def format_components(components, indent_level=4, max_line_length=115, add_empty_lines=True):
    """Format a list of ComponentSpec objects into a readable string representation.

@@ -1103,7 +1055,8 @@ def generate_modular_model_card_content(blocks) -> dict[str, Any]:
        - blocks_description: Detailed architecture of blocks
        - components_description: List of required components
        - configs_section: Configuration parameters section
        - io_specification_section: Input/Output specification (per-workflow or unified)
        - inputs_description: Input parameters specification
        - outputs_description: Output parameters specification
        - trigger_inputs_section: Conditional execution information
        - tags: List of relevant tags for the model card
    """
@@ -1122,6 +1075,15 @@ def generate_modular_model_card_content(blocks) -> dict[str, Any]:
            if block_desc:
                blocks_desc_parts.append(f"  - {block_desc}")

            # add sub-blocks if any
            if hasattr(block, "sub_blocks") and block.sub_blocks:
                for sub_name, sub_block in block.sub_blocks.items():
                    sub_class = sub_block.__class__.__name__
                    sub_desc = sub_block.description.split("\n")[0] if getattr(sub_block, "description", "") else ""
                    blocks_desc_parts.append(f"  - *{sub_name}*: `{sub_class}`")
                    if sub_desc:
                        blocks_desc_parts.append(f"    - {sub_desc}")

    blocks_description = "\n".join(blocks_desc_parts) if blocks_desc_parts else "No blocks defined."

    components = getattr(blocks, "expected_components", [])
@@ -1147,76 +1109,63 @@ def generate_modular_model_card_content(blocks) -> dict[str, Any]:
    if configs_description:
        configs_section = f"\n\n## Configuration Parameters\n\n{configs_description}"

    # Branch on whether workflows are defined
    has_workflows = getattr(blocks, "_workflow_map", None) is not None
    inputs = blocks.inputs
    outputs = blocks.outputs

    if has_workflows:
        workflow_map = blocks._workflow_map
        parts = []
    # format inputs as markdown list
    inputs_parts = []
        required_inputs = [inp for inp in inputs if inp.required]
        optional_inputs = [inp for inp in inputs if not inp.required]

        # If blocks overrides outputs (e.g. to return just "images" instead of all intermediates),
        # use that as the shared output for all workflows
        blocks_outputs = blocks.outputs
        blocks_intermediate = getattr(blocks, "intermediate_outputs", None)
        shared_outputs = (
            blocks_outputs if blocks_intermediate is not None and blocks_outputs != blocks_intermediate else None
        )
    if required_inputs:
        inputs_parts.append("**Required:**\n")
        for inp in required_inputs:
            if hasattr(inp.type_hint, "__name__"):
                type_str = inp.type_hint.__name__
            elif inp.type_hint is not None:
                type_str = str(inp.type_hint).replace("typing.", "")
            else:
                type_str = "Any"
            desc = inp.description or "No description provided"
            inputs_parts.append(f"- `{inp.name}` (`{type_str}`): {desc}")

        parts.append("## Workflow Input Specification\n")
    if optional_inputs:
        if required_inputs:
            inputs_parts.append("")
        inputs_parts.append("**Optional:**\n")
        for inp in optional_inputs:
            if hasattr(inp.type_hint, "__name__"):
                type_str = inp.type_hint.__name__
            elif inp.type_hint is not None:
                type_str = str(inp.type_hint).replace("typing.", "")
            else:
                type_str = "Any"
            desc = inp.description or "No description provided"
            default_str = f", default: `{inp.default}`" if inp.default is not None else ""
            inputs_parts.append(f"- `{inp.name}` (`{type_str}`){default_str}: {desc}")

        # Per-workflow details: show trigger inputs with full param descriptions
        for wf_name, trigger_inputs in workflow_map.items():
            trigger_input_names = set(trigger_inputs.keys())
            try:
                workflow_blocks = blocks.get_workflow(wf_name)
            except Exception:
                parts.append(f"<details>\n<summary><strong>{wf_name}</strong></summary>\n")
                parts.append("*Could not resolve workflow blocks.*\n")
                parts.append("</details>\n")
                continue
    inputs_description = "\n".join(inputs_parts) if inputs_parts else "No specific inputs defined."

            wf_inputs = workflow_blocks.inputs
            # Show only trigger inputs with full parameter descriptions
            trigger_params = [p for p in wf_inputs if p.name in trigger_input_names]
    # format outputs as markdown list
    outputs_parts = []
    for out in outputs:
        if hasattr(out.type_hint, "__name__"):
            type_str = out.type_hint.__name__
        elif out.type_hint is not None:
            type_str = str(out.type_hint).replace("typing.", "")
        else:
            type_str = "Any"
        desc = out.description or "No description provided"
        outputs_parts.append(f"- `{out.name}` (`{type_str}`): {desc}")

            parts.append(f"<details>\n<summary><strong>{wf_name}</strong></summary>\n")
    outputs_description = "\n".join(outputs_parts) if outputs_parts else "Standard pipeline outputs."

            inputs_str = format_params_markdown(trigger_params, header=None)
            parts.append(inputs_str if inputs_str else "No additional inputs required.")
            parts.append("")

            parts.append("</details>\n")

        # Common Inputs & Outputs section (like non-workflow pipelines)
        all_inputs = blocks.inputs
        all_outputs = shared_outputs if shared_outputs is not None else blocks.outputs

        inputs_str = format_params_markdown(all_inputs, "Inputs")
        outputs_str = format_params_markdown(all_outputs, "Outputs")
        inputs_description = inputs_str if inputs_str else "No specific inputs defined."
        outputs_description = outputs_str if outputs_str else "Standard pipeline outputs."

        parts.append(f"\n## Input/Output Specification\n\n{inputs_description}\n\n{outputs_description}")

        io_specification_section = "\n".join(parts)
        # Suppress trigger_inputs_section when workflows are shown (it's redundant)
        trigger_inputs_section = ""
    else:
        # Unified I/O section (original behavior)
        inputs = blocks.inputs
        outputs = blocks.outputs
        inputs_str = format_params_markdown(inputs, "Inputs")
        outputs_str = format_params_markdown(outputs, "Outputs")
        inputs_description = inputs_str if inputs_str else "No specific inputs defined."
        outputs_description = outputs_str if outputs_str else "Standard pipeline outputs."
        io_specification_section = f"## Input/Output Specification\n\n{inputs_description}\n\n{outputs_description}"

        trigger_inputs_section = ""
        if hasattr(blocks, "trigger_inputs") and blocks.trigger_inputs:
            trigger_inputs_list = sorted([t for t in blocks.trigger_inputs if t is not None])
            if trigger_inputs_list:
                trigger_inputs_str = ", ".join(f"`{t}`" for t in trigger_inputs_list)
                trigger_inputs_section = f"""
    trigger_inputs_section = ""
    if hasattr(blocks, "trigger_inputs") and blocks.trigger_inputs:
        trigger_inputs_list = sorted([t for t in blocks.trigger_inputs if t is not None])
        if trigger_inputs_list:
            trigger_inputs_str = ", ".join(f"`{t}`" for t in trigger_inputs_list)
            trigger_inputs_section = f"""
### Conditional Execution

This pipeline contains blocks that are selected at runtime based on inputs:
@@ -1229,18 +1178,7 @@ This pipeline contains blocks that are selected at runtime based on inputs:
    if hasattr(blocks, "model_name") and blocks.model_name:
        tags.append(blocks.model_name)

    if has_workflows:
        # Derive tags from workflow names
        workflow_names = set(blocks._workflow_map.keys())
        if any("inpainting" in wf for wf in workflow_names):
            tags.append("inpainting")
        if any("image2image" in wf for wf in workflow_names):
            tags.append("image-to-image")
        if any("controlnet" in wf for wf in workflow_names):
            tags.append("controlnet")
        if any("text2image" in wf for wf in workflow_names):
            tags.append("text-to-image")
    elif hasattr(blocks, "trigger_inputs") and blocks.trigger_inputs:
    if hasattr(blocks, "trigger_inputs") and blocks.trigger_inputs:
        triggers = blocks.trigger_inputs
        if any(t in triggers for t in ["mask", "mask_image"]):
            tags.append("inpainting")
@@ -1268,7 +1206,8 @@ This pipeline uses a {block_count}-block architecture that can be customized and
        "blocks_description": blocks_description,
        "components_description": components_description,
        "configs_section": configs_section,
        "io_specification_section": io_specification_section,
        "inputs_description": inputs_description,
        "outputs_description": outputs_description,
        "trigger_inputs_section": trigger_inputs_section,
        "tags": tags,
    }

@@ -502,10 +502,6 @@ class AudioLDM2Pipeline(DiffusionPipeline):
                    text_input_ids,
                    attention_mask=attention_mask,
                )
                # Extract the pooler output if it's a BaseModelOutputWithPooling (Transformers v5+)
                # otherwise use it directly (Transformers v4)
                if hasattr(prompt_embeds, "pooler_output"):
                    prompt_embeds = prompt_embeds.pooler_output
                # append the seq-len dim: (bs, hidden_size) -> (bs, seq_len, hidden_size)
                prompt_embeds = prompt_embeds[:, None, :]
                # make sure that we attend to this single hidden-state
@@ -614,10 +610,6 @@ class AudioLDM2Pipeline(DiffusionPipeline):
                    uncond_input_ids,
                    attention_mask=negative_attention_mask,
                )
                # Extract the pooler output if it's a BaseModelOutputWithPooling (Transformers v5+)
                # otherwise use it directly (Transformers v4)
                if hasattr(negative_prompt_embeds, "pooler_output"):
                    negative_prompt_embeds = negative_prompt_embeds.pooler_output
                # append the seq-len dim: (bs, hidden_size) -> (bs, seq_len, hidden_size)
                negative_prompt_embeds = negative_prompt_embeds[:, None, :]
                # make sure that we attend to this single hidden-state
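The version-compatibility shim removed here is plain duck typing; a standalone sketch, where the wrapped output object is a stand-in rather than the real transformers class:

```python
from types import SimpleNamespace
import torch

def to_pooled(output):
    # Mirrors the removed shim: newer transformers return an output object
    # with `.pooler_output`; older versions returned the pooled tensor directly.
    if hasattr(output, "pooler_output"):
        output = output.pooler_output
    return output[:, None, :]  # (bs, hidden) -> (bs, 1, hidden)

pooled = torch.randn(2, 16)
wrapped = SimpleNamespace(pooler_output=pooled)
assert to_pooled(wrapped).shape == to_pooled(pooled).shape == (2, 1, 16)
```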
@@ -287,9 +287,6 @@ class Cosmos2_5_PredictBasePipeline(DiffusionPipeline):
                truncation=True,
                padding="max_length",
            )
            input_ids = (
                input_ids["input_ids"] if not isinstance(input_ids, list) and "input_ids" in input_ids else input_ids
            )
            input_ids = torch.LongTensor(input_ids)
            input_ids_batch.append(input_ids)


@@ -17,6 +17,9 @@ from typing import Callable, Dict, List, Optional, Union
import numpy as np
import PIL.Image
import torch
import torchvision
import torchvision.transforms
import torchvision.transforms.functional
from transformers import AutoTokenizer, Qwen2_5_VLForConditionalGeneration

from ...callbacks import MultiPipelineCallbacks, PipelineCallback
@@ -51,13 +54,11 @@ else:
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


def _maybe_pad_or_trim_video(video: torch.Tensor, num_frames: int):
def _maybe_pad_video(video: torch.Tensor, num_frames: int):
    n_pad_frames = num_frames - video.shape[2]
    if n_pad_frames > 0:
        last_frame = video[:, :, -1:, :, :]
        video = torch.cat((video, last_frame.repeat(1, 1, n_pad_frames, 1, 1)), dim=2)
    elif num_frames < video.shape[2]:
        video = video[:, :, :num_frames, :, :]
    return video

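The pad-or-trim behavior in a toy run (frame counts are arbitrary):

```python
import torch

def _maybe_pad_or_trim_video(video: torch.Tensor, num_frames: int):
    # [B, C, T, H, W]: pad by repeating the last frame, or trim extra frames.
    n_pad_frames = num_frames - video.shape[2]
    if n_pad_frames > 0:
        last_frame = video[:, :, -1:, :, :]
        video = torch.cat((video, last_frame.repeat(1, 1, n_pad_frames, 1, 1)), dim=2)
    elif num_frames < video.shape[2]:
        video = video[:, :, :num_frames, :, :]
    return video

video = torch.randn(1, 3, 5, 8, 8)
assert _maybe_pad_or_trim_video(video, 9).shape[2] == 9   # padded
assert _maybe_pad_or_trim_video(video, 3).shape[2] == 3   # trimmed
```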
@@ -133,8 +134,8 @@ EXAMPLE_DOC_STRING = """
        >>> controls = [Image.fromarray(x.numpy()) for x in controls.permute(1, 2, 3, 0)]
        >>> export_to_video(controls, "edge_controlled_video_edge.mp4", fps=30)

        >>> # Transfer inference with controls.
        >>> video = pipe(
        ...     video=input_video[:num_frames],
        ...     controls=controls,
        ...     controls_conditioning_scale=1.0,
        ...     prompt=prompt,
@@ -148,7 +149,7 @@ EXAMPLE_DOC_STRING = """

class Cosmos2_5_TransferPipeline(DiffusionPipeline):
    r"""
    Pipeline for Cosmos Transfer2.5, supporting auto-regressive inference.
    Pipeline for Cosmos Transfer2.5 base model.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
@@ -165,14 +166,12 @@ class Cosmos2_5_TransferPipeline(DiffusionPipeline):
            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
        vae ([`AutoencoderKLWan`]):
            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
        controlnet ([`CosmosControlNetModel`]):
            ControlNet used to condition generation on control inputs.
    """

    model_cpu_offload_seq = "text_encoder->transformer->controlnet->vae"
    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
    # We mark safety_checker as optional here to get around some test failures, but it is not really optional
    _optional_components = ["safety_checker"]
    _optional_components = ["safety_checker", "controlnet"]
    _exclude_from_cpu_offload = ["safety_checker"]

    def __init__(
@@ -182,8 +181,8 @@ class Cosmos2_5_TransferPipeline(DiffusionPipeline):
        transformer: CosmosTransformer3DModel,
        vae: AutoencoderKLWan,
        scheduler: UniPCMultistepScheduler,
        controlnet: CosmosControlNetModel,
        safety_checker: Optional[CosmosSafetyChecker] = None,
        controlnet: Optional[CosmosControlNetModel],
        safety_checker: CosmosSafetyChecker = None,
    ):
        super().__init__()

@@ -263,9 +262,6 @@ class Cosmos2_5_TransferPipeline(DiffusionPipeline):
                truncation=True,
                padding="max_length",
            )
            input_ids = (
                input_ids["input_ids"] if not isinstance(input_ids, list) and "input_ids" in input_ids else input_ids
            )
            input_ids = torch.LongTensor(input_ids)
            input_ids_batch.append(input_ids)

@@ -385,11 +381,10 @@ class Cosmos2_5_TransferPipeline(DiffusionPipeline):
        num_frames_in: int = 93,
        num_frames_out: int = 93,
        do_classifier_free_guidance: bool = True,
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        num_cond_latent_frames: int = 0,
        dtype: torch.dtype | None = None,
        device: torch.device | None = None,
        generator: torch.Generator | list[torch.Generator] | None = None,
        latents: torch.Tensor | None = None,
    ) -> torch.Tensor:
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
@@ -404,14 +399,10 @@ class Cosmos2_5_TransferPipeline(DiffusionPipeline):
        W = width // self.vae_scale_factor_spatial
        shape = (B, C, T, H, W)

        if latents is not None:
            if latents.shape[1:] != shape[1:]:
                raise ValueError(f"Unexpected `latents` shape, got {latents.shape}, expected {shape}.")
            latents = latents.to(device=device, dtype=dtype)
        else:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)

        if num_frames_in == 0:
            if latents is None:
                latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)

            cond_mask = torch.zeros((B, 1, T, H, W), dtype=latents.dtype, device=latents.device)
            cond_indicator = torch.zeros((B, 1, T, 1, 1), dtype=latents.dtype, device=latents.device)

@@ -441,12 +432,16 @@ class Cosmos2_5_TransferPipeline(DiffusionPipeline):
        latents_std = self.latents_std.to(device=device, dtype=dtype)
        cond_latents = (cond_latents - latents_mean) / latents_std

        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            latents = latents.to(device=device, dtype=dtype)

        padding_shape = (B, 1, T, H, W)
        ones_padding = latents.new_ones(padding_shape)
        zeros_padding = latents.new_zeros(padding_shape)

        cond_indicator = latents.new_zeros(B, 1, latents.size(2), 1, 1)
        cond_indicator[:, :, 0:num_cond_latent_frames, :, :] = 1.0
        cond_indicator = latents.new_zeros(1, 1, latents.size(2), 1, 1)
        cond_mask = cond_indicator * ones_padding + (1 - cond_indicator) * zeros_padding

        return (
@@ -456,7 +451,34 @@ class Cosmos2_5_TransferPipeline(DiffusionPipeline):
            cond_indicator,
        )

    # Modified from diffusers.pipelines.cosmos.pipeline_cosmos_text2world.CosmosTextToWorldPipeline.check_inputs
    def _encode_controls(
        self,
        controls: Optional[torch.Tensor],
        height: int,
        width: int,
        num_frames: int,
        dtype: torch.dtype,
        device: torch.device,
        generator: torch.Generator | list[torch.Generator] | None,
    ) -> Optional[torch.Tensor]:
        if controls is None:
            return None

        control_video = self.video_processor.preprocess_video(controls, height, width)
        control_video = _maybe_pad_video(control_video, num_frames)

        control_video = control_video.to(device=device, dtype=self.vae.dtype)
        control_latents = [
            retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator=generator) for vid in control_video
        ]
        control_latents = torch.cat(control_latents, dim=0).to(dtype)

        latents_mean = self.latents_mean.to(device=device, dtype=dtype)
        latents_std = self.latents_std.to(device=device, dtype=dtype)
        control_latents = (control_latents - latents_mean) / latents_std
        return control_latents

    # Copied from diffusers.pipelines.cosmos.pipeline_cosmos_text2world.CosmosTextToWorldPipeline.check_inputs
    def check_inputs(
        self,
        prompt,
@@ -464,25 +486,9 @@ class Cosmos2_5_TransferPipeline(DiffusionPipeline):
        width,
        prompt_embeds=None,
        callback_on_step_end_tensor_inputs=None,
        num_ar_conditional_frames=None,
        num_ar_latent_conditional_frames=None,
        num_frames_per_chunk=None,
        num_frames=None,
        conditional_frame_timestep=0.1,
    ):
        if width <= 0 or height <= 0 or height % 16 != 0 or width % 16 != 0:
            raise ValueError(
                f"`height` and `width` have to be divisible by 16 (& positive) but are {height} and {width}."
            )

        if num_frames is not None and num_frames <= 0:
            raise ValueError(f"`num_frames` has to be a positive integer when provided but is {num_frames}.")

        if conditional_frame_timestep < 0 or conditional_frame_timestep > 1:
            raise ValueError(
                "`conditional_frame_timestep` has to be a float in the [0, 1] interval but is "
                f"{conditional_frame_timestep}."
            )
        if height % 16 != 0 or width % 16 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")

        if callback_on_step_end_tensor_inputs is not None and not all(
            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
@@ -503,46 +509,6 @@ class Cosmos2_5_TransferPipeline(DiffusionPipeline):
        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if num_ar_latent_conditional_frames is not None and num_ar_conditional_frames is not None:
            raise ValueError(
                "Provide only one of `num_ar_conditional_frames` or `num_ar_latent_conditional_frames`, not both."
            )
        if num_ar_latent_conditional_frames is None and num_ar_conditional_frames is None:
            raise ValueError("Provide either `num_ar_conditional_frames` or `num_ar_latent_conditional_frames`.")
        if num_ar_latent_conditional_frames is not None and num_ar_latent_conditional_frames < 0:
            raise ValueError("`num_ar_latent_conditional_frames` must be >= 0.")
        if num_ar_conditional_frames is not None and num_ar_conditional_frames < 0:
            raise ValueError("`num_ar_conditional_frames` must be >= 0.")

        if num_ar_latent_conditional_frames is not None:
            num_ar_conditional_frames = max(
                0, (num_ar_latent_conditional_frames - 1) * self.vae_scale_factor_temporal + 1
            )

        min_chunk_len = self.vae_scale_factor_temporal + 1
        if num_frames_per_chunk < min_chunk_len:
            logger.warning(f"{num_frames_per_chunk=} must be larger than {min_chunk_len=}, setting to min_chunk_len")
            num_frames_per_chunk = min_chunk_len

        max_frames_by_rope = None
        if getattr(self.transformer.config, "max_size", None) is not None:
            max_frames_by_rope = max(
                size // patch
                for size, patch in zip(self.transformer.config.max_size, self.transformer.config.patch_size)
            )
            if num_frames_per_chunk > max_frames_by_rope:
                raise ValueError(
                    f"{num_frames_per_chunk=} is too large for RoPE setting ({max_frames_by_rope=}). "
                    "Please reduce `num_frames_per_chunk`."
                )

        if num_ar_conditional_frames >= num_frames_per_chunk:
            raise ValueError(
                f"{num_ar_conditional_frames=} must be smaller than {num_frames_per_chunk=} for chunked generation."
            )

        return num_frames_per_chunk

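The pixel-frame to latent-frame conversion used in the removed checks is worth spelling out; a sketch, where the temporal scale factor of 4 is an assumption matching typical Wan-style VAEs:

```python
def latent_to_pixel_frames(num_latent_frames: int, temporal_scale: int = 4) -> int:
    # The first latent frame covers 1 pixel frame; every later latent frame
    # covers `temporal_scale` pixel frames, hence (n - 1) * scale + 1.
    return max(0, (num_latent_frames - 1) * temporal_scale + 1)

assert latent_to_pixel_frames(1) == 1
assert latent_to_pixel_frames(2) == 5
assert latent_to_pixel_frames(24) == 93  # the pipeline's default chunk length
```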
@property
|
||||
def guidance_scale(self):
|
||||
return self._guidance_scale
|
||||
@@ -567,22 +533,23 @@ class Cosmos2_5_TransferPipeline(DiffusionPipeline):
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        controls: PipelineImageInput | List[PipelineImageInput],
        controls_conditioning_scale: Union[float, List[float]] = 1.0,
        image: PipelineImageInput | None = None,
        video: List[PipelineImageInput] | None = None,
        prompt: Union[str, List[str]] | None = None,
        negative_prompt: Union[str, List[str]] = DEFAULT_NEGATIVE_PROMPT,
        height: int = 704,
        width: Optional[int] = None,
        num_frames: Optional[int] = None,
        num_frames_per_chunk: int = 93,
        width: int | None = None,
        num_frames: int = 93,
        num_inference_steps: int = 36,
        guidance_scale: float = 3.0,
        num_videos_per_prompt: int = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        output_type: Optional[str] = "pil",
        num_videos_per_prompt: Optional[int] = 1,
        generator: torch.Generator | list[torch.Generator] | None = None,
        latents: torch.Tensor | None = None,
        controls: Optional[PipelineImageInput | List[PipelineImageInput]] = None,
        controls_conditioning_scale: float | list[float] = 1.0,
        prompt_embeds: torch.Tensor | None = None,
        negative_prompt_embeds: torch.Tensor | None = None,
        output_type: str = "pil",
        return_dict: bool = True,
        callback_on_step_end: Optional[
            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
@@ -590,26 +557,24 @@ class Cosmos2_5_TransferPipeline(DiffusionPipeline):
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        max_sequence_length: int = 512,
        conditional_frame_timestep: float = 0.1,
        num_ar_conditional_frames: Optional[int] = 1,
        num_ar_latent_conditional_frames: Optional[int] = None,
    ):
r"""
|
||||
`controls` drive the conditioning through ControlNet. Controls are assumed to be pre-processed, e.g. edge maps
|
||||
are pre-computed.
|
||||
The call function to the pipeline for generation. Supports three modes:
|
||||
|
||||
Setting `num_frames` will restrict the total number of frames output, if not provided or assigned to None
|
||||
(default) then the number of output frames will match the input `controls`.
|
||||
- **Text2World**: `image=None`, `video=None`, `prompt` provided. Generates a world clip.
|
||||
- **Image2World**: `image` provided, `video=None`, `prompt` provided. Conditions on a single frame.
|
||||
- **Video2World**: `video` provided, `image=None`, `prompt` provided. Conditions on an input clip.
|
||||
|
||||
Auto-regressive inference is supported and thus a sliding window of `num_frames_per_chunk` frames are used per
|
||||
denoising loop. In addition, when auto-regressive inference is performed, the previous
|
||||
`num_ar_latent_conditional_frames` or `num_ar_conditional_frames` are used to condition the following denoising
|
||||
inference loops.
|
||||
Set `num_frames=93` (default) to produce a world video, or `num_frames=1` to produce a single image frame (the
|
||||
above in "*2Image mode").
|
||||
|
||||
Outputs follow `output_type` (e.g., `"pil"` returns a list of `num_frames` PIL images per prompt).
|
||||
|
||||
Args:
|
||||
controls (`PipelineImageInput`, `List[PipelineImageInput]`):
|
||||
Control image or video input used by the ControlNet.
|
||||
controls_conditioning_scale (`float` or `List[float]`, *optional*, defaults to `1.0`):
|
||||
The scale factor(s) for the ControlNet outputs. A single float is broadcast to all control blocks.
|
||||
image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, *optional*):
|
||||
Optional single image for Image2World conditioning. Must be `None` when `video` is provided.
|
||||
video (`List[PIL.Image.Image]`, `np.ndarray`, `torch.Tensor`, *optional*):
|
||||
Optional input video for Video2World conditioning. Must be `None` when `image` is provided.
|
||||
prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts to guide generation. Required unless `prompt_embeds` is supplied.
|
||||
height (`int`, defaults to `704`):
|
||||
@@ -617,10 +582,9 @@ class Cosmos2_5_TransferPipeline(DiffusionPipeline):
            width (`int`, *optional*):
                The width in pixels of the generated image. If not provided, this will be determined based on the
                aspect ratio of the input and the provided height.
            num_frames (`int`, *optional*):
                Number of output frames. Defaults to `None` to output the same number of frames as the input
                `controls`.
            num_inference_steps (`int`, defaults to `36`):
            num_frames (`int`, defaults to `93`):
                Number of output frames. Use `93` for world (video) generation; set to `1` to return a single frame.
            num_inference_steps (`int`, defaults to `35`):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, defaults to `3.0`):
@@ -634,9 +598,13 @@ class Cosmos2_5_TransferPipeline(DiffusionPipeline):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs. Can be used to
                tweak the same generation with different prompts. If not provided, a latents tensor is generated by
                sampling using the supplied random `generator`.
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
            controls (`PipelineImageInput`, `List[PipelineImageInput]`, *optional*):
                Control image or video input used by the ControlNet. If `None`, ControlNet is skipped.
            controls_conditioning_scale (`float` or `List[float]`, *optional*, defaults to `1.0`):
                The scale factor(s) for the ControlNet outputs. A single float is broadcast to all control blocks.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
@@ -659,18 +627,7 @@ class Cosmos2_5_TransferPipeline(DiffusionPipeline):
            max_sequence_length (`int`, defaults to `512`):
                The maximum number of tokens in the prompt. If the prompt exceeds this length, it will be truncated. If
                the prompt is shorter than this length, it will be padded.
            num_ar_conditional_frames (`int`, *optional*, defaults to `1`):
                Number of frames to condition on in subsequent inference loops of auto-regressive inference, i.e. for
                the second chunk and onwards. Only used if `num_ar_latent_conditional_frames` is `None`.

                This is only used when auto-regressive inference is performed, i.e. when the number of frames in
                `controls` is > `num_frames_per_chunk`.
            num_ar_latent_conditional_frames (`int`, *optional*):
                Number of latent frames to condition on in subsequent inference loops of auto-regressive inference,
                i.e. for the second chunk and onwards. Only used if `num_ar_conditional_frames` is `None`.

                This is only used when auto-regressive inference is performed, i.e. when the number of frames in
                `controls` is > `num_frames_per_chunk`.
        Examples:

        Returns:
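A minimal usage sketch of the modes described in the docstring above; the checkpoint id and the assumption that the class is importable from the top-level `diffusers` namespace are illustrative, not part of this diff:

    # Sketch only: the repo id below is hypothetical; substitute a real Cosmos Transfer 2.5 checkpoint.
    import torch
    from PIL import Image
    from diffusers import Cosmos2_5_TransferPipeline  # assumes a top-level export

    pipe = Cosmos2_5_TransferPipeline.from_pretrained(
        "nvidia/cosmos-transfer-2.5",  # hypothetical repo id
        torch_dtype=torch.bfloat16,
    ).to("cuda")

    # Placeholder control video; in practice these are pre-computed edge maps or similar.
    control_frames = [Image.new("RGB", (1280, 704))] * 93

    # Text2World: no image/video, conditioning comes from the pre-processed controls.
    output = pipe(
        prompt="a robot arm assembling a gearbox",
        controls=control_frames,
        num_frames=93,
        guidance_scale=3.0,
    )
    frames = output.frames  # pass image=... or video=... for Image2World / Video2World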
@@ -690,40 +647,21 @@ class Cosmos2_5_TransferPipeline(DiffusionPipeline):
            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

        if width is None:
            frame = controls[0] if isinstance(controls, list) else controls
            if isinstance(frame, list):
                frame = frame[0]
            if isinstance(frame, (torch.Tensor, np.ndarray)):
                if frame.ndim == 5:
                    frame = frame[0, 0]
                elif frame.ndim == 4:
                    frame = frame[0]
            frame = image or video[0] if image or video else None
            if frame is None and controls is not None:
                frame = controls[0] if isinstance(controls, list) else controls
                if isinstance(frame, (torch.Tensor, np.ndarray)) and len(frame.shape) == 4:
                    frame = controls[0]

            if isinstance(frame, PIL.Image.Image):
            if frame is None:
                width = int((height + 16) * (1280 / 720))
            elif isinstance(frame, PIL.Image.Image):
                width = int((height + 16) * (frame.width / frame.height))
            else:
                if frame.ndim != 3:
                    raise ValueError("`controls` must contain 3D frames in CHW format.")
                width = int((height + 16) * (frame.shape[2] / frame.shape[1]))  # NOTE: assuming C H W

        num_frames_per_chunk = self.check_inputs(
            prompt,
            height,
            width,
            prompt_embeds,
            callback_on_step_end_tensor_inputs,
            num_ar_conditional_frames,
            num_ar_latent_conditional_frames,
            num_frames_per_chunk,
            num_frames,
            conditional_frame_timestep,
        )

        if num_ar_latent_conditional_frames is not None:
            num_cond_latent_frames = num_ar_latent_conditional_frames
            num_ar_conditional_frames = max(0, (num_cond_latent_frames - 1) * self.vae_scale_factor_temporal + 1)
        else:
            num_cond_latent_frames = max(0, (num_ar_conditional_frames - 1) // self.vae_scale_factor_temporal + 1)
        # Check inputs. Raise error if not correct
        self.check_inputs(prompt, height, width, prompt_embeds, callback_on_step_end_tensor_inputs)

        self._guidance_scale = guidance_scale
        self._current_timestep = None
@@ -768,137 +706,102 @@ class Cosmos2_5_TransferPipeline(DiffusionPipeline):
        vae_dtype = self.vae.dtype
        transformer_dtype = self.transformer.dtype

        if getattr(self.transformer.config, "img_context_dim_in", None):
            img_context = torch.zeros(
                batch_size,
                self.transformer.config.img_context_num_tokens,
                self.transformer.config.img_context_dim_in,
                device=prompt_embeds.device,
        img_context = torch.zeros(
            batch_size,
            self.transformer.config.img_context_num_tokens,
            self.transformer.config.img_context_dim_in,
            device=prompt_embeds.device,
            dtype=transformer_dtype,
        )
        encoder_hidden_states = (prompt_embeds, img_context)
        neg_encoder_hidden_states = (negative_prompt_embeds, img_context)

        num_frames_in = None
        if image is not None:
            if batch_size != 1:
                raise ValueError(f"batch_size must be 1 for image input (given {batch_size})")

            image = torchvision.transforms.functional.to_tensor(image).unsqueeze(0)
            video = torch.cat([image, torch.zeros_like(image).repeat(num_frames - 1, 1, 1, 1)], dim=0)
            video = video.unsqueeze(0)
            num_frames_in = 1
        elif video is None:
            video = torch.zeros(batch_size, num_frames, 3, height, width, dtype=torch.uint8)
            num_frames_in = 0
        else:
            num_frames_in = len(video)

            if batch_size != 1:
                raise ValueError(f"batch_size must be 1 for video input (given {batch_size})")

        assert video is not None
        video = self.video_processor.preprocess_video(video, height, width)

        # pad with last frame (for video2world)
        num_frames_out = num_frames
        video = _maybe_pad_video(video, num_frames_out)
        assert num_frames_in <= num_frames_out, f"expected ({num_frames_in=}) <= ({num_frames_out=})"

        video = video.to(device=device, dtype=vae_dtype)

        num_channels_latents = self.transformer.config.in_channels - 1
        latents, cond_latent, cond_mask, cond_indicator = self.prepare_latents(
            video=video,
            batch_size=batch_size * num_videos_per_prompt,
            num_channels_latents=num_channels_latents,
            height=height,
            width=width,
            num_frames_in=num_frames_in,
            num_frames_out=num_frames,
            do_classifier_free_guidance=self.do_classifier_free_guidance,
            dtype=torch.float32,
            device=device,
            generator=generator,
            latents=latents,
        )
        cond_timestep = torch.ones_like(cond_indicator) * conditional_frame_timestep
        cond_mask = cond_mask.to(transformer_dtype)

        controls_latents = None
        if controls is not None:
            controls_latents = self._encode_controls(
                controls,
                height=height,
                width=width,
                num_frames=num_frames,
                dtype=transformer_dtype,
                device=device,
                generator=generator,
            )

        if num_videos_per_prompt > 1:
            img_context = img_context.repeat_interleave(num_videos_per_prompt, dim=0)
        padding_mask = latents.new_zeros(1, 1, height, width, dtype=transformer_dtype)

            encoder_hidden_states = (prompt_embeds, img_context)
            neg_encoder_hidden_states = (negative_prompt_embeds, img_context)
        else:
            encoder_hidden_states = prompt_embeds
            neg_encoder_hidden_states = negative_prompt_embeds
        # Denoising loop
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps
        self._num_timesteps = len(timesteps)
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order

        control_video = self.video_processor.preprocess_video(controls, height, width)
        if control_video.shape[0] != batch_size:
            if control_video.shape[0] == 1:
                control_video = control_video.repeat(batch_size, 1, 1, 1, 1)
            else:
                raise ValueError(
                    f"Expected controls batch size {batch_size} to match prompt batch size, but got {control_video.shape[0]}."
        gt_velocity = (latents - cond_latent) * cond_mask
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                if self.interrupt:
                    continue

                self._current_timestep = t.cpu().item()

                # NOTE: assumes sigma(t) \in [0, 1]
                sigma_t = (
                    torch.tensor(self.scheduler.sigmas[i].item())
                    .unsqueeze(0)
                    .to(device=device, dtype=transformer_dtype)
                )

        num_frames_out = control_video.shape[2]
        if num_frames is not None:
            num_frames_out = min(num_frames_out, num_frames)

        control_video = _maybe_pad_or_trim_video(control_video, num_frames_out)

        # chunk information
        num_latent_frames_per_chunk = (num_frames_per_chunk - 1) // self.vae_scale_factor_temporal + 1
        chunk_stride = num_frames_per_chunk - num_ar_conditional_frames
        chunk_idxs = [
            (start_idx, min(start_idx + num_frames_per_chunk, num_frames_out))
            for start_idx in range(0, num_frames_out - num_ar_conditional_frames, chunk_stride)
        ]
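The sliding-window schedule above is easy to sanity-check in isolation; a standalone numeric sketch mirroring the comprehension (values are illustrative):

    # Standalone sketch of the chunk-index computation above.
    num_frames_out = 150
    num_frames_per_chunk = 93
    num_ar_conditional_frames = 1

    chunk_stride = num_frames_per_chunk - num_ar_conditional_frames  # 92
    chunk_idxs = [
        (start_idx, min(start_idx + num_frames_per_chunk, num_frames_out))
        for start_idx in range(0, num_frames_out - num_ar_conditional_frames, chunk_stride)
    ]
    print(chunk_idxs)  # [(0, 93), (92, 150)] -- chunk 2 overlaps chunk 1 by the conditioning frame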
        video_chunks = []
        latents_mean = self.latents_mean.to(dtype=vae_dtype, device=device)
        latents_std = self.latents_std.to(dtype=vae_dtype, device=device)

        def decode_latents(latents):
            latents = latents * latents_std + latents_mean
            video = self.vae.decode(latents.to(dtype=self.vae.dtype, device=device), return_dict=False)[0]
            return video

        latents_arg = latents
        initial_num_cond_latent_frames = 0
        latent_chunks = []
        num_chunks = len(chunk_idxs)
        total_steps = num_inference_steps * num_chunks
        with self.progress_bar(total=total_steps) as progress_bar:
            for chunk_idx, (start_idx, end_idx) in enumerate(chunk_idxs):
                if chunk_idx == 0:
                    prev_output = torch.zeros((batch_size, num_frames_per_chunk, 3, height, width), dtype=vae_dtype)
                    prev_output = self.video_processor.preprocess_video(prev_output, height, width)
                else:
                    prev_output = video_chunks[-1].clone()
                    if num_ar_conditional_frames > 0:
                        prev_output[:, :, :num_ar_conditional_frames] = prev_output[:, :, -num_ar_conditional_frames:]
                        prev_output[:, :, num_ar_conditional_frames:] = -1  # -1 == 0 in processed video space
                    else:
                        prev_output.fill_(-1)

                chunk_video = prev_output.to(device=device, dtype=vae_dtype)
                chunk_video = _maybe_pad_or_trim_video(chunk_video, num_frames_per_chunk)
                latents, cond_latent, cond_mask, cond_indicator = self.prepare_latents(
                    video=chunk_video,
                    batch_size=batch_size * num_videos_per_prompt,
                    num_channels_latents=self.transformer.config.in_channels - 1,
                    height=height,
                    width=width,
                    num_frames_in=chunk_video.shape[2],
                    num_frames_out=num_frames_per_chunk,
                    do_classifier_free_guidance=self.do_classifier_free_guidance,
                    dtype=torch.float32,
                    device=device,
                    generator=generator,
                    num_cond_latent_frames=initial_num_cond_latent_frames
                    if chunk_idx == 0
                    else num_cond_latent_frames,
                    latents=latents_arg,
                )
                cond_mask = cond_mask.to(transformer_dtype)
                cond_timestep = torch.ones_like(cond_indicator) * conditional_frame_timestep
                padding_mask = latents.new_zeros(1, 1, height, width, dtype=transformer_dtype)

                chunk_control_video = control_video[:, :, start_idx:end_idx, ...].to(
                    device=device, dtype=self.vae.dtype
                )
                chunk_control_video = _maybe_pad_or_trim_video(chunk_control_video, num_frames_per_chunk)
                if isinstance(generator, list):
                    controls_latents = [
                        retrieve_latents(self.vae.encode(chunk_control_video[i].unsqueeze(0)), generator=generator[i])
                        for i in range(chunk_control_video.shape[0])
                    ]
                else:
                    controls_latents = [
                        retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator=generator)
                        for vid in chunk_control_video
                    ]
                controls_latents = torch.cat(controls_latents, dim=0).to(transformer_dtype)

                controls_latents = (controls_latents - latents_mean) / latents_std

                # Denoising loop
                self.scheduler.set_timesteps(num_inference_steps, device=device)
                timesteps = self.scheduler.timesteps
                self._num_timesteps = len(timesteps)

                gt_velocity = (latents - cond_latent) * cond_mask
                for i, t in enumerate(timesteps):
                    if self.interrupt:
                        continue

                    self._current_timestep = t.cpu().item()

                    # NOTE: assumes sigma(t) \in [0, 1]
                    sigma_t = (
                        torch.tensor(self.scheduler.sigmas[i].item())
                        .unsqueeze(0)
                        .to(device=device, dtype=transformer_dtype)
                    )

                in_latents = cond_mask * cond_latent + (1 - cond_mask) * latents
                in_latents = in_latents.to(transformer_dtype)
                in_timestep = cond_indicator * cond_timestep + (1 - cond_indicator) * sigma_t
                    in_latents = cond_mask * cond_latent + (1 - cond_mask) * latents
                    in_latents = in_latents.to(transformer_dtype)
                    in_timestep = cond_indicator * cond_timestep + (1 - cond_indicator) * sigma_t
                    control_blocks = None
                    if controls_latents is not None and self.controlnet is not None:
                        control_output = self.controlnet(
                            controls_latents=controls_latents,
                            latents=in_latents,
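The conditioning trick above is easy to isolate: wherever `cond_mask` is 1 the model sees the clean conditioning latent with a small fixed timestep, and the predicted velocity there is overwritten with the ground-truth velocity. A minimal sketch with toy tensors (shapes are illustrative, not the pipeline's real ones):

    # Toy illustration of mask-based conditioning; shapes and values are made up.
    import torch

    latents = torch.randn(1, 4, 8)       # current noisy latents
    cond_latent = torch.randn(1, 4, 8)   # clean latents of the conditioning frames
    cond_mask = torch.zeros(1, 4, 8)
    cond_mask[:, :, :2] = 1.0            # first two "frames" are conditioning

    in_latents = cond_mask * cond_latent + (1 - cond_mask) * latents
    gt_velocity = (latents - cond_latent) * cond_mask

    noise_pred = torch.randn_like(latents)                   # stand-in for the transformer output
    noise_pred = gt_velocity + noise_pred * (1 - cond_mask)  # model velocity kept only on non-conditioning frames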
@@ -911,18 +814,20 @@ class Cosmos2_5_TransferPipeline(DiffusionPipeline):
                )
                control_blocks = control_output[0]

                noise_pred = self.transformer(
                    hidden_states=in_latents,
                    timestep=in_timestep,
                    encoder_hidden_states=encoder_hidden_states,
                    block_controlnet_hidden_states=control_blocks,
                    condition_mask=cond_mask,
                    padding_mask=padding_mask,
                    return_dict=False,
                )[0]
                noise_pred = gt_velocity + noise_pred * (1 - cond_mask)
                    noise_pred = self.transformer(
                        hidden_states=in_latents,
                        timestep=in_timestep,
                        encoder_hidden_states=encoder_hidden_states,
                        block_controlnet_hidden_states=control_blocks,
                        condition_mask=cond_mask,
                        padding_mask=padding_mask,
                        return_dict=False,
                    )[0]
                    noise_pred = gt_velocity + noise_pred * (1 - cond_mask)

                if self.do_classifier_free_guidance:
                    if self.do_classifier_free_guidance:
                        control_blocks = None
                        if controls_latents is not None and self.controlnet is not None:
                            control_output = self.controlnet(
                                controls_latents=controls_latents,
                                latents=in_latents,
@@ -935,50 +840,46 @@ class Cosmos2_5_TransferPipeline(DiffusionPipeline):
                            )
                            control_blocks = control_output[0]

                    noise_pred_neg = self.transformer(
                        hidden_states=in_latents,
                        timestep=in_timestep,
                        encoder_hidden_states=neg_encoder_hidden_states,  # NOTE: negative prompt
                        block_controlnet_hidden_states=control_blocks,
                        condition_mask=cond_mask,
                        padding_mask=padding_mask,
                        return_dict=False,
                    )[0]
                    # NOTE: replace velocity (noise_pred_neg) with gt_velocity for conditioning inputs only
                    noise_pred_neg = gt_velocity + noise_pred_neg * (1 - cond_mask)
                    noise_pred = noise_pred + self.guidance_scale * (noise_pred - noise_pred_neg)
                        noise_pred_neg = self.transformer(
                            hidden_states=in_latents,
                            timestep=in_timestep,
                            encoder_hidden_states=neg_encoder_hidden_states,  # NOTE: negative prompt
                            block_controlnet_hidden_states=control_blocks,
                            condition_mask=cond_mask,
                            padding_mask=padding_mask,
                            return_dict=False,
                        )[0]
                        # NOTE: replace velocity (noise_pred_neg) with gt_velocity for conditioning inputs only
                        noise_pred_neg = gt_velocity + noise_pred_neg * (1 - cond_mask)
                        noise_pred = noise_pred + self.guidance_scale * (noise_pred - noise_pred_neg)

                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
                    latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]

                # call the callback, if provided
                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
                    if callback_on_step_end is not None:
                        callback_kwargs = {}
                        for k in callback_on_step_end_tensor_inputs:
                            callback_kwargs[k] = locals()[k]
                        callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
                        latents = callback_outputs.pop("latents", latents)
                        prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                        negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)

                    if i == total_steps - 1 or ((i + 1) % self.scheduler.order == 0):
                        progress_bar.update()
                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()

                if XLA_AVAILABLE:
                    xm.mark_step()

                video_chunks.append(decode_latents(latents).detach().cpu())
                latent_chunks.append(latents.detach().cpu())
                    if XLA_AVAILABLE:
                        xm.mark_step()

        self._current_timestep = None

        if not output_type == "latent":
            video_chunks = [
                chunk[:, :, num_ar_conditional_frames:, ...] if chunk_idx != 0 else chunk
                for chunk_idx, chunk in enumerate(video_chunks)
            ]
            video = torch.cat(video_chunks, dim=2)
            video = video[:, :, :num_frames_out, ...]
            latents_mean = self.latents_mean.to(latents.device, latents.dtype)
            latents_std = self.latents_std.to(latents.device, latents.dtype)
            latents = latents * latents_std + latents_mean
            video = self.vae.decode(latents.to(self.vae.dtype), return_dict=False)[0]
            video = self._match_num_frames(video, num_frames)

            assert self.safety_checker is not None
            self.safety_checker.to(device)
@@ -995,13 +896,7 @@ class Cosmos2_5_TransferPipeline(DiffusionPipeline):
            video = torch.from_numpy(video).permute(0, 4, 1, 2, 3)
            video = self.video_processor.postprocess_video(video, output_type=output_type)
        else:
            latent_T = (num_frames_out - 1) // self.vae_scale_factor_temporal + 1
            latent_chunks = [
                chunk[:, :, num_cond_latent_frames:, ...] if chunk_idx != 0 else chunk
                for chunk_idx, chunk in enumerate(latent_chunks)
            ]
            video = torch.cat(latent_chunks, dim=2)
            video = video[:, :, :latent_T, ...]
            video = latents

        # Offload all models
        self.maybe_free_model_hooks()
@@ -1010,3 +905,19 @@ class Cosmos2_5_TransferPipeline(DiffusionPipeline):
            return (video,)

        return CosmosPipelineOutput(frames=video)

    def _match_num_frames(self, video: torch.Tensor, target_num_frames: int) -> torch.Tensor:
        if target_num_frames <= 0 or video.shape[2] == target_num_frames:
            return video

        frames_per_latent = max(self.vae_scale_factor_temporal, 1)
        video = torch.repeat_interleave(video, repeats=frames_per_latent, dim=2)

        current_frames = video.shape[2]
        if current_frames < target_num_frames:
            pad = video[:, :, -1:, :, :].repeat(1, 1, target_num_frames - current_frames, 1, 1)
            video = torch.cat([video, pad], dim=2)
        elif current_frames > target_num_frames:
            video = video[:, :, :target_num_frames]

        return video
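A quick standalone check of the frame-matching logic above (the [B, C, T, H, W] layout is taken from the code; the numbers are illustrative):

    # Illustrative check of repeat-then-pad/trim frame matching.
    import torch

    vae_scale_factor_temporal = 8
    video = torch.randn(1, 3, 4, 16, 16)  # 4 latent-rate frames
    video = torch.repeat_interleave(video, repeats=vae_scale_factor_temporal, dim=2)
    assert video.shape[2] == 32            # upsampled to pixel-rate frames

    target = 29
    video = video[:, :, :target] if video.shape[2] > target else video
    assert video.shape[2] == 29            # trimmed down to the requested frame count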
@@ -20,8 +20,6 @@ class MultilingualCLIP(PreTrainedModel):
        self.LinearTransformation = torch.nn.Linear(
            in_features=config.transformerDimensions, out_features=config.numDims
        )
        if hasattr(self, "post_init"):
            self.post_init()

    def forward(self, input_ids, attention_mask):
        embs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)[0]

@@ -781,9 +781,6 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
            self.prefix_encoder = PrefixEncoder(config)
            self.dropout = torch.nn.Dropout(0.1)

        if hasattr(self, "post_init"):
            self.post_init()

    def get_input_embeddings(self):
        return self.embedding.word_embeddings

@@ -813,7 +810,7 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else getattr(self.config, "use_cache", None)
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size, seq_length = input_ids.shape
@@ -699,13 +699,9 @@ class LTX2ImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTX2LoraL
        mask_shape = (batch_size, 1, num_frames, height, width)

        if latents is not None:
            conditioning_mask = latents.new_zeros(mask_shape)
            conditioning_mask[:, :, 0] = 1.0
            if latents.ndim == 5:
                # conditioning_mask needs to be the same shape as latents in two-stage generation.
                batch_size, _, num_frames, height, width = latents.shape
                mask_shape = (batch_size, 1, num_frames, height, width)
            conditioning_mask = latents.new_zeros(mask_shape)
            conditioning_mask[:, :, 0] = 1.0

            latents = self._normalize_latents(
                latents, self.vae.latents_mean, self.vae.latents_std, self.vae.config.scaling_factor
            )
@@ -714,9 +710,6 @@ class LTX2ImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTX2LoraL
            latents = self._pack_latents(
                latents, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
            )
        else:
            conditioning_mask = latents.new_zeros(mask_shape)
            conditioning_mask[:, :, 0] = 1.0
        conditioning_mask = self._pack_latents(
            conditioning_mask, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
        ).squeeze(-1)
@@ -341,7 +341,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
            save_method_accept_safe = "safe_serialization" in save_method_signature.parameters
            save_method_accept_variant = "variant" in save_method_signature.parameters
            save_method_accept_max_shard_size = "max_shard_size" in save_method_signature.parameters
            save_method_accept_peft_format = "save_peft_format" in save_method_signature.parameters

            save_kwargs = {}
            if save_method_accept_safe:
@@ -351,11 +350,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
            if save_method_accept_max_shard_size and max_shard_size is not None:
                # max_shard_size is expected to not be None in ModelMixin
                save_kwargs["max_shard_size"] = max_shard_size
            if save_method_accept_peft_format:
                # Set save_peft_format=False for transformers>=5.0.0 compatibility
                # In transformers 5.0.0+, the default save_peft_format=True adds a "base_model.model" prefix
                # to adapter keys, but from_pretrained expects keys without this prefix
                save_kwargs["save_peft_format"] = False

            save_method(os.path.join(save_directory, pipeline_component_name), **save_kwargs)
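The pattern above, feature-detecting keyword arguments through the save method's signature, is easy to reuse outside diffusers; a minimal standalone sketch:

    # Sketch of signature-based feature detection for optional kwargs.
    import inspect

    def save_component(obj, path, **maybe_kwargs):
        sig = inspect.signature(obj.save_pretrained)
        kwargs = {k: v for k, v in maybe_kwargs.items() if k in sig.parameters}
        obj.save_pretrained(path, **kwargs)  # only pass kwargs the method actually accepts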
@@ -24,25 +24,14 @@ except OptionalDependencyNotAvailable:
else:
    _import_structure["pipeline_prx"] = ["PRXPipeline"]

# Wrap T5GemmaEncoder to pass config.encoder (T5GemmaModuleConfig) instead of the
# composite T5GemmaConfig, which lacks flat attributes expected by T5GemmaEncoder.__init__.
# Import T5GemmaEncoder for pipeline loading compatibility
try:
    if is_transformers_available():
        import transformers
        from transformers.models.t5gemma.modeling_t5gemma import T5GemmaEncoder as _T5GemmaEncoder

        class T5GemmaEncoder(_T5GemmaEncoder):
            @classmethod
            def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
                if "config" not in kwargs:
                    from transformers.models.t5gemma.configuration_t5gemma import T5GemmaConfig

                    config = T5GemmaConfig.from_pretrained(pretrained_model_name_or_path)
                    if hasattr(config, "encoder"):
                        kwargs["config"] = config.encoder
                return super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
        from transformers.models.t5gemma.modeling_t5gemma import T5GemmaEncoder

        _additional_imports["T5GemmaEncoder"] = T5GemmaEncoder
        # Patch transformers module directly for serialization
        if not hasattr(transformers, "T5GemmaEncoder"):
            transformers.T5GemmaEncoder = T5GemmaEncoder
except ImportError:
@@ -18,6 +18,7 @@ import re
import urllib.parse as ul
from typing import Callable

import ftfy
import torch
from transformers import (
    AutoTokenizer,
@@ -33,13 +34,13 @@ from diffusers.models.transformers.transformer_prx import PRXTransformer2DModel
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.pipelines.prx.pipeline_output import PRXPipelineOutput
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
from diffusers.utils import is_ftfy_available, logging, replace_example_docstring
from diffusers.utils import (
    logging,
    replace_example_docstring,
)
from diffusers.utils.torch_utils import randn_tensor


if is_ftfy_available():
    import ftfy

DEFAULT_RESOLUTION = 512

ASPECT_RATIO_256_BIN = {
@@ -17,7 +17,7 @@ from typing import Any, Callable

import regex as re
import torch
from transformers import AutoTokenizer, T5EncoderModel, UMT5EncoderModel
from transformers import AutoTokenizer, UMT5EncoderModel

from ...callbacks import MultiPipelineCallbacks, PipelineCallback
from ...loaders import SkyReelsV2LoraLoaderMixin
@@ -132,7 +132,7 @@ class SkyReelsV2Pipeline(DiffusionPipeline, SkyReelsV2LoraLoaderMixin):
    def __init__(
        self,
        tokenizer: AutoTokenizer,
        text_encoder: T5EncoderModel | UMT5EncoderModel,
        text_encoder: UMT5EncoderModel,
        transformer: SkyReelsV2Transformer3DModel,
        vae: AutoencoderKLWan,
        scheduler: UniPCMultistepScheduler,

@@ -19,7 +19,7 @@ from copy import deepcopy
from typing import Any, Callable

import torch
from transformers import AutoTokenizer, T5EncoderModel, UMT5EncoderModel
from transformers import AutoTokenizer, UMT5EncoderModel

from ...callbacks import MultiPipelineCallbacks, PipelineCallback
from ...loaders import SkyReelsV2LoraLoaderMixin
@@ -153,7 +153,7 @@ class SkyReelsV2DiffusionForcingPipeline(DiffusionPipeline, SkyReelsV2LoraLoader
    def __init__(
        self,
        tokenizer: AutoTokenizer,
        text_encoder: T5EncoderModel | UMT5EncoderModel,
        text_encoder: UMT5EncoderModel,
        transformer: SkyReelsV2Transformer3DModel,
        vae: AutoencoderKLWan,
        scheduler: UniPCMultistepScheduler,

@@ -20,7 +20,7 @@ from typing import Any, Callable

import PIL
import torch
from transformers import AutoTokenizer, T5EncoderModel, UMT5EncoderModel
from transformers import AutoTokenizer, UMT5EncoderModel

from diffusers.image_processor import PipelineImageInput
from diffusers.utils.torch_utils import randn_tensor
@@ -158,7 +158,7 @@ class SkyReelsV2DiffusionForcingImageToVideoPipeline(DiffusionPipeline, SkyReels
    def __init__(
        self,
        tokenizer: AutoTokenizer,
        text_encoder: T5EncoderModel | UMT5EncoderModel,
        text_encoder: UMT5EncoderModel,
        transformer: SkyReelsV2Transformer3DModel,
        vae: AutoencoderKLWan,
        scheduler: UniPCMultistepScheduler,

@@ -21,7 +21,7 @@ from typing import Any, Callable

import torch
from PIL import Image
from transformers import AutoTokenizer, T5EncoderModel, UMT5EncoderModel
from transformers import AutoTokenizer, UMT5EncoderModel

from ...callbacks import MultiPipelineCallbacks, PipelineCallback
from ...loaders import SkyReelsV2LoraLoaderMixin
@@ -214,7 +214,7 @@ class SkyReelsV2DiffusionForcingVideoToVideoPipeline(DiffusionPipeline, SkyReels
    def __init__(
        self,
        tokenizer: AutoTokenizer,
        text_encoder: T5EncoderModel | UMT5EncoderModel,
        text_encoder: UMT5EncoderModel,
        transformer: SkyReelsV2Transformer3DModel,
        vae: AutoencoderKLWan,
        scheduler: UniPCMultistepScheduler,

@@ -18,7 +18,7 @@ from typing import Any, Callable
import PIL
import regex as re
import torch
from transformers import AutoTokenizer, CLIPProcessor, CLIPVisionModelWithProjection, T5EncoderModel, UMT5EncoderModel
from transformers import AutoTokenizer, CLIPProcessor, CLIPVisionModelWithProjection, UMT5EncoderModel

from ...callbacks import MultiPipelineCallbacks, PipelineCallback
from ...image_processor import PipelineImageInput
@@ -157,7 +157,7 @@ class SkyReelsV2ImageToVideoPipeline(DiffusionPipeline, SkyReelsV2LoraLoaderMixi
    def __init__(
        self,
        tokenizer: AutoTokenizer,
        text_encoder: T5EncoderModel | UMT5EncoderModel,
        text_encoder: UMT5EncoderModel,
        image_encoder: CLIPVisionModelWithProjection,
        image_processor: CLIPProcessor,
        transformer: SkyReelsV2Transformer3DModel,
@@ -112,8 +112,6 @@ def _load_transformers_model_from_dduf(
            tensors = safetensors.torch.load(mmap)
            # Update the state dictionary with tensors
            state_dict.update(tensors)
    # `from_pretrained` sets the model to eval mode by default, which is the
    # correct behavior for inference. Do not call `model.train()` here.
    return cls.from_pretrained(
        pretrained_model_name_or_path=None,
        config=config,
|
||||
|
||||
@property
|
||||
def do_classifier_free_guidance(self):
|
||||
return self._guidance_scale > 0
|
||||
return self._guidance_scale > 1
|
||||
|
||||
@property
|
||||
def joint_attention_kwargs(self):
|
||||
|
||||
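The `> 1` gate matches the usual classifier-free guidance combination (a generic sketch, not ZImage's exact update rule):

    # out = uncond + s * (cond - uncond); at s = 1 this collapses to cond alone,
    # so guidance only changes the result (and justifies the second forward pass) for s > 1.
    import torch

    cond, uncond, s = torch.randn(4), torch.randn(4), 1.0
    out = uncond + s * (cond - uncond)
    assert torch.allclose(out, cond)  # s = 1.0 is a no-op relative to the conditional branch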
@@ -516,9 +513,6 @@ def dequantize_gguf_tensor(tensor):

    block_size, type_size = GGML_QUANT_SIZES[quant_type]

    # Convert to plain tensor to avoid unnecessary __torch_function__ overhead.
    tensor = tensor.as_tensor()

    tensor = tensor.view(torch.uint8)
    shape = _quant_shape_from_byte_shape(tensor.shape, type_size, block_size)

@@ -528,7 +525,7 @@ def dequantize_gguf_tensor(tensor):
    dequant = dequant_fn(blocks, block_size, type_size)
    dequant = dequant.reshape(shape)

    return dequant
    return dequant.as_tensor()


class GGUFParameter(torch.nn.Parameter):
@@ -14,7 +14,6 @@

import math
from dataclasses import dataclass
from typing import Literal

import numpy as np
import torch
@@ -42,7 +41,7 @@ class FlowMatchLCMSchedulerOutput(BaseOutput):
        denoising loop.
    """

    prev_sample: torch.Tensor
    prev_sample: torch.FloatTensor


class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
@@ -80,11 +79,11 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
        use_beta_sigmas (`bool`, defaults to False):
            Whether to use beta sigmas for step sizes in the noise schedule during sampling.
        time_shift_type (`str`, defaults to "exponential"):
            The type of dynamic resolution-dependent timestep shifting to apply.
        scale_factors (`list[float]`, *optional*, defaults to `None`):
            The type of dynamic resolution-dependent timestep shifting to apply. Either "exponential" or "linear".
        scale_factors ('list', defaults to None)
            It defines how to scale the latents at which predictions are made.
        upscale_mode (`str`, *optional*, defaults to "bicubic"):
            Upscaling method, applied if scale-wise generation is considered.
        upscale_mode ('str', defaults to 'bicubic')
            Upscaling method, applied if scale-wise generation is considered
    """

    _compatibles = []
@@ -102,33 +101,16 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
        max_image_seq_len: int = 4096,
        invert_sigmas: bool = False,
        shift_terminal: float | None = None,
        use_karras_sigmas: bool | None = False,
        use_exponential_sigmas: bool | None = False,
        use_beta_sigmas: bool | None = False,
        time_shift_type: Literal["exponential", "linear"] = "exponential",
        use_karras_sigmas: bool = False,
        use_exponential_sigmas: bool = False,
        use_beta_sigmas: bool = False,
        time_shift_type: str = "exponential",
        scale_factors: list[float] | None = None,
        upscale_mode: Literal[
            "nearest",
            "linear",
            "bilinear",
            "bicubic",
            "trilinear",
            "area",
            "nearest-exact",
        ] = "bicubic",
        upscale_mode: str = "bicubic",
    ):
        if self.config.use_beta_sigmas and not is_scipy_available():
            raise ImportError("Make sure to install scipy if you want to use beta sigmas.")
        if (
            sum(
                [
                    self.config.use_beta_sigmas,
                    self.config.use_exponential_sigmas,
                    self.config.use_karras_sigmas,
                ]
            )
            > 1
        ):
        if sum([self.config.use_beta_sigmas, self.config.use_exponential_sigmas, self.config.use_karras_sigmas]) > 1:
            raise ValueError(
                "Only one of `config.use_beta_sigmas`, `config.use_exponential_sigmas`, `config.use_karras_sigmas` can be used."
            )
@@ -180,7 +162,7 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
        return self._begin_index

    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
    def set_begin_index(self, begin_index: int = 0) -> None:
    def set_begin_index(self, begin_index: int = 0):
        """
        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.

@@ -190,18 +172,18 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
        """
        self._begin_index = begin_index

    def set_shift(self, shift: float) -> None:
    def set_shift(self, shift: float):
        self._shift = shift

    def set_scale_factors(self, scale_factors: list[float], upscale_mode: str) -> None:
    def set_scale_factors(self, scale_factors: list, upscale_mode):
        """
        Sets scale factors for a scale-wise generation regime.

        Args:
            scale_factors (`list[float]`):
                The scale factors for each step.
            scale_factors (`list`):
                The scale factors for each step
            upscale_mode (`str`):
                Upscaling method.
                Upscaling method
        """
        self._scale_factors = scale_factors
        self._upscale_mode = upscale_mode
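A short sketch of how the scale-wise knobs above are meant to be driven from a pipeline, assuming default construction of the scheduler works and that it is exported from `diffusers.schedulers` (the factor values are illustrative):

    # Illustrative: enable scale-wise generation on the LCM flow-match scheduler.
    from diffusers.schedulers import FlowMatchLCMScheduler  # assumed export location

    scheduler = FlowMatchLCMScheduler()
    scheduler.set_scale_factors(scale_factors=[2.0, 1.5, 1.25], upscale_mode="bicubic")
    # At each step with a configured factor, the predicted x0 is resized by that factor
    # (via torch.nn.functional.interpolate) before being re-noised to the next sigma.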
@@ -256,18 +238,16 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):

        return sample

    def _sigma_to_t(self, sigma: float | torch.FloatTensor) -> float | torch.FloatTensor:
    def _sigma_to_t(self, sigma):
        return sigma * self.config.num_train_timesteps

    def time_shift(
        self, mu: float, sigma: float, t: float | np.ndarray | torch.Tensor
    ) -> float | np.ndarray | torch.Tensor:
    def time_shift(self, mu: float, sigma: float, t: torch.Tensor):
        if self.config.time_shift_type == "exponential":
            return self._time_shift_exponential(mu, sigma, t)
        elif self.config.time_shift_type == "linear":
            return self._time_shift_linear(mu, sigma, t)

    def stretch_shift_to_terminal(self, t: np.ndarray | torch.Tensor) -> np.ndarray | torch.Tensor:
    def stretch_shift_to_terminal(self, t: torch.Tensor) -> torch.Tensor:
        r"""
        Stretches and shifts the timestep schedule to ensure it terminates at the configured `shift_terminal` config
        value.
@@ -276,13 +256,12 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
        https://github.com/Lightricks/LTX-Video/blob/a01a171f8fe3d99dce2728d60a73fecf4d4238ae/ltx_video/schedulers/rf.py#L51

        Args:
            t (`torch.Tensor` or `np.ndarray`):
                A tensor or numpy array of timesteps to be stretched and shifted.
            t (`torch.Tensor`):
                A tensor of timesteps to be stretched and shifted.

        Returns:
            `torch.Tensor` or `np.ndarray`:
                A tensor or numpy array of adjusted timesteps such that the final value equals
                `self.config.shift_terminal`.
            `torch.Tensor`:
                A tensor of adjusted timesteps such that the final value equals `self.config.shift_terminal`.
        """
        one_minus_z = 1 - t
        scale_factor = one_minus_z[-1] / (1 - self.config.shift_terminal)
@@ -291,12 +270,12 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):

    def set_timesteps(
        self,
        num_inference_steps: int | None = None,
        device: str | torch.device | None = None,
        num_inference_steps: int = None,
        device: str | torch.device = None,
        sigmas: list[float] | None = None,
        mu: float | None = None,
        mu: float = None,
        timesteps: list[float] | None = None,
    ) -> None:
    ):
        """
        Sets the discrete timesteps used for the diffusion chain (to be run before inference).

@@ -338,45 +317,43 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
        is_timesteps_provided = timesteps is not None

        if is_timesteps_provided:
            timesteps = np.array(timesteps).astype(np.float32)  # type: ignore
            timesteps = np.array(timesteps).astype(np.float32)

        if sigmas is None:
            if timesteps is None:
                timesteps = np.linspace(  # type: ignore
                    self._sigma_to_t(self.sigma_max),
                    self._sigma_to_t(self.sigma_min),
                    num_inference_steps,
                timesteps = np.linspace(
                    self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps
                )
            sigmas = timesteps / self.config.num_train_timesteps  # type: ignore
            sigmas = timesteps / self.config.num_train_timesteps
        else:
            sigmas = np.array(sigmas).astype(np.float32)  # type: ignore
            sigmas = np.array(sigmas).astype(np.float32)
            num_inference_steps = len(sigmas)

        # 2. Perform timestep shifting. Either no shifting is applied, or resolution-dependent shifting of
        # "exponential" or "linear" type is applied
        if self.config.use_dynamic_shifting:
            sigmas = self.time_shift(mu, 1.0, sigmas)  # type: ignore
            sigmas = self.time_shift(mu, 1.0, sigmas)
        else:
            sigmas = self.shift * sigmas / (1 + (self.shift - 1) * sigmas)  # type: ignore
            sigmas = self.shift * sigmas / (1 + (self.shift - 1) * sigmas)

        # 3. If required, stretch the sigmas schedule to terminate at the configured `shift_terminal` value
        if self.config.shift_terminal:
            sigmas = self.stretch_shift_to_terminal(sigmas)  # type: ignore
            sigmas = self.stretch_shift_to_terminal(sigmas)

        # 4. If required, convert sigmas to one of karras, exponential, or beta sigma schedules
        if self.config.use_karras_sigmas:
            sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)  # type: ignore
            sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
        elif self.config.use_exponential_sigmas:
            sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=num_inference_steps)  # type: ignore
            sigmas = self._convert_to_exponential(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
        elif self.config.use_beta_sigmas:
            sigmas = self._convert_to_beta(in_sigmas=sigmas, num_inference_steps=num_inference_steps)  # type: ignore
            sigmas = self._convert_to_beta(in_sigmas=sigmas, num_inference_steps=num_inference_steps)

        # 5. Convert sigmas and timesteps to tensors and move to specified device
        sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)  # type: ignore
        sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)
        if not is_timesteps_provided:
            timesteps = sigmas * self.config.num_train_timesteps  # type: ignore
            timesteps = sigmas * self.config.num_train_timesteps
        else:
            timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32, device=device)  # type: ignore
            timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32, device=device)

        # 6. Append the terminal sigma value.
        # If a model requires inverted sigma schedule for denoising but timesteps without inversion, the
@@ -393,11 +370,7 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
        self._step_index = None
        self._begin_index = None

    def index_for_timestep(
        self,
        timestep: float | torch.Tensor,
        schedule_timesteps: torch.Tensor | None = None,
    ) -> int:
    def index_for_timestep(self, timestep, schedule_timesteps=None):
        if schedule_timesteps is None:
            schedule_timesteps = self.timesteps

@@ -409,9 +382,9 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
        pos = 1 if len(indices) > 1 else 0

        return int(indices[pos].item())
        return indices[pos].item()

    def _init_step_index(self, timestep: float | torch.Tensor) -> None:
    def _init_step_index(self, timestep):
        if self.begin_index is None:
            if isinstance(timestep, torch.Tensor):
                timestep = timestep.to(self.timesteps.device)
@@ -486,12 +459,7 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
            size = [round(self._scale_factors[self._step_index] * size) for size in self._init_size]
            x0_pred = torch.nn.functional.interpolate(x0_pred, size=size, mode=self._upscale_mode)

            noise = randn_tensor(
                x0_pred.shape,
                generator=generator,
                device=x0_pred.device,
                dtype=x0_pred.dtype,
            )
            noise = randn_tensor(x0_pred.shape, generator=generator, device=x0_pred.device, dtype=x0_pred.dtype)
            prev_sample = (1 - sigma_next) * x0_pred + sigma_next * noise

        # upon completion increase step index by one
@@ -505,7 +473,7 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
        return FlowMatchLCMSchedulerOutput(prev_sample=prev_sample)

    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
    def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
        """
        Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
        Models](https://huggingface.co/papers/2206.00364).
@@ -626,15 +594,11 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
            )
        return sigmas

    def _time_shift_exponential(
        self, mu: float, sigma: float, t: float | np.ndarray | torch.Tensor
    ) -> float | np.ndarray | torch.Tensor:
    def _time_shift_exponential(self, mu, sigma, t):
        return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)

    def _time_shift_linear(
        self, mu: float, sigma: float, t: float | np.ndarray | torch.Tensor
    ) -> float | np.ndarray | torch.Tensor:
    def _time_shift_linear(self, mu, sigma, t):
        return mu / (mu + (1 / t - 1) ** sigma)

    def __len__(self) -> int:
    def __len__(self):
        return self.config.num_train_timesteps
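The two shift functions above warp a uniform sigma grid toward higher noise; a quick standalone check of the exponential variant, with the math copied from the code and illustrative values:

    # Verify the exponential time shift at t = 0.5 with mu = ln(3), sigma = 1.0.
    import math

    mu, sigma, t = math.log(3), 1.0, 0.5
    shifted = math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
    assert abs(shifted - 0.75) < 1e-12  # exp(mu)/(exp(mu)+1) = 3/4, pulling 0.5 toward 1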
@@ -31,18 +31,14 @@ class IPNDMScheduler(SchedulerMixin, ConfigMixin):
    Args:
        num_train_timesteps (`int`, defaults to 1000):
            The number of diffusion steps to train the model.
        trained_betas (`np.ndarray` or `List[float]`, *optional*):
        trained_betas (`np.ndarray`, *optional*):
            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
    """

    order = 1

    @register_to_config
    def __init__(
        self,
        num_train_timesteps: int = 1000,
        trained_betas: np.ndarray | list[float] | None = None,
    ):
    def __init__(self, num_train_timesteps: int = 1000, trained_betas: np.ndarray | list[float] | None = None):
        # set `betas`, `alphas`, `timesteps`
        self.set_timesteps(num_train_timesteps)

@@ -60,29 +56,21 @@ class IPNDMScheduler(SchedulerMixin, ConfigMixin):
        self._begin_index = None

    @property
    def step_index(self) -> int | None:
    def step_index(self):
        """
        The index counter for current timestep. It will increase 1 after each scheduler step.

        Returns:
            `int` or `None`:
                The index counter for current timestep.
        """
        return self._step_index

    @property
    def begin_index(self) -> int | None:
    def begin_index(self):
        """
        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.

        Returns:
            `int` or `None`:
                The index for the first timestep.
        """
        return self._begin_index

    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
    def set_begin_index(self, begin_index: int = 0) -> None:
    def set_begin_index(self, begin_index: int = 0):
        """
        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.

@@ -181,7 +169,7 @@ class IPNDMScheduler(SchedulerMixin, ConfigMixin):
        Args:
            model_output (`torch.Tensor`):
                The direct output from learned diffusion model.
            timestep (`int` or `torch.Tensor`):
            timestep (`int`):
                The current discrete timestep in the diffusion chain.
            sample (`torch.Tensor`):
                A current instance of a sample created by the diffusion process.
@@ -240,30 +228,7 @@ class IPNDMScheduler(SchedulerMixin, ConfigMixin):
        """
        return sample

    def _get_prev_sample(
        self,
        sample: torch.Tensor,
        timestep_index: int,
        prev_timestep_index: int,
        ets: torch.Tensor,
    ) -> torch.Tensor:
        """
        Predicts the previous sample based on the current sample, timestep indices, and running model outputs.

        Args:
            sample (`torch.Tensor`):
                The current sample.
            timestep_index (`int`):
                Index of the current timestep in the schedule.
            prev_timestep_index (`int`):
                Index of the previous timestep in the schedule.
            ets (`torch.Tensor`):
                The running sequence of model outputs.

        Returns:
            `torch.Tensor`:
                The predicted previous sample.
        """
    def _get_prev_sample(self, sample, timestep_index, prev_timestep_index, ets):
        alpha = self.alphas[timestep_index]
        sigma = self.betas[timestep_index]

@@ -275,5 +240,5 @@ class IPNDMScheduler(SchedulerMixin, ConfigMixin):

        return prev_sample

    def __len__(self) -> int:
    def __len__(self):
        return self.config.num_train_timesteps
@@ -299,10 +299,7 @@ def get_cached_module_file(
    # Download and cache module_file from the repo `pretrained_model_name_or_path`, or grab it if it's a local file.
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)

    if subfolder is not None:
        module_file_or_url = os.path.join(pretrained_model_name_or_path, subfolder, module_file)
    else:
        module_file_or_url = os.path.join(pretrained_model_name_or_path, module_file)
    module_file_or_url = os.path.join(pretrained_model_name_or_path, module_file)

    if os.path.isfile(module_file_or_url):
        resolved_module_file = module_file_or_url
@@ -387,11 +384,7 @@ def get_cached_module_file(
            if not os.path.exists(submodule_path / module_folder):
                os.makedirs(submodule_path / module_folder)
            module_needed = f"{module_needed}.py"
            if subfolder is not None:
                source_path = os.path.join(pretrained_model_name_or_path, subfolder, module_needed)
            else:
                source_path = os.path.join(pretrained_model_name_or_path, module_needed)
            shutil.copyfile(source_path, submodule_path / module_needed)
            shutil.copyfile(os.path.join(pretrained_model_name_or_path, module_needed), submodule_path / module_needed)
    else:
        # Get the commit hash
        # TODO: we will get this info in the etag soon, so retrieve it from there and not here.
|
||||
widget: list[dict] | None = None,
|
||||
inference: bool | None = None,
|
||||
is_modular: bool = False,
|
||||
update_model_card: bool = False,
|
||||
) -> ModelCard:
|
||||
"""
|
||||
Loads or creates a model card.
|
||||
@@ -134,9 +133,6 @@ def load_or_create_model_card(
|
||||
`load_or_create_model_card` from a training script.
|
||||
is_modular: (`bool`, optional): Boolean flag to denote if the model card is for a modular pipeline.
|
||||
When True, uses model_description as-is without additional template formatting.
|
||||
update_model_card: (`bool`, optional): When True, regenerates the model card content even if one
|
||||
already exists on the remote repo. Existing card metadata (tags, license, etc.) is preserved. Only
|
||||
supported for modular pipelines (i.e., `is_modular=True`).
|
||||
"""
|
||||
if not is_jinja_available():
|
||||
raise ValueError(
|
||||
@@ -145,17 +141,9 @@ def load_or_create_model_card(
|
||||
" To install it, please run `pip install Jinja2`."
|
||||
)
|
||||
|
||||
if update_model_card and not is_modular:
|
||||
raise ValueError("`update_model_card=True` is only supported for modular pipelines (`is_modular=True`).")
|
||||
|
||||
try:
|
||||
# Check if the model card is present on the remote repo
|
||||
model_card = ModelCard.load(repo_id_or_path, token=token)
|
||||
# For modular pipelines, regenerate card content when requested (preserve existing metadata)
|
||||
if update_model_card and is_modular and model_description is not None:
|
||||
existing_data = model_card.data
|
||||
model_card = ModelCard(model_description)
|
||||
model_card.data = existing_data
|
||||
except (EntryNotFoundError, RepositoryNotFoundError):
|
||||
# Otherwise create a model card from template
|
||||
if from_training:
|
||||
|
||||
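
A hedged usage sketch of the `update_model_card` flag added above (the repo id and description are hypothetical):

from diffusers.utils.hub_utils import load_or_create_model_card

# Hypothetical call: regenerate the card body for a modular pipeline while
# keeping the existing metadata (tags, license, ...) intact.
card = load_or_create_model_card(
    "my-user/my-modular-pipeline",  # hypothetical repo id
    model_description="Regenerated description for the modular pipeline.",
    is_modular=True,
    update_model_card=True,
)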
@@ -131,26 +131,6 @@ class CosmosControlNetModelTests(ModelTesterMixin, unittest.TestCase):
        self.assertIsInstance(output[0], list)
        self.assertEqual(len(output[0]), init_dict["n_controlnet_blocks"])

    def test_condition_mask_changes_output(self):
        """Test that condition mask affects control outputs."""
        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
        model = self.model_class(**init_dict)
        model.to(torch_device)
        model.eval()

        inputs_no_mask = dict(inputs_dict)
        inputs_no_mask["condition_mask"] = torch.zeros_like(inputs_dict["condition_mask"])

        with torch.no_grad():
            output_no_mask = model(**inputs_no_mask)
            output_with_mask = model(**inputs_dict)

        self.assertEqual(len(output_no_mask.control_block_samples), len(output_with_mask.control_block_samples))
        for no_mask_tensor, with_mask_tensor in zip(
            output_no_mask.control_block_samples, output_with_mask.control_block_samples
        ):
            self.assertFalse(torch.allclose(no_mask_tensor, with_mask_tensor))

    def test_conditioning_scale_single(self):
        """Test that a single conditioning scale is broadcast to all blocks."""
        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()

@@ -1,15 +1,9 @@
import json
import os
import tempfile
import unittest
from unittest.mock import MagicMock, patch
from unittest.mock import patch

import torch
from transformers import CLIPTextModel, LongformerModel

from diffusers import ConfigMixin
from diffusers.models import AutoModel, UNet2DConditionModel
from diffusers.models.modeling_utils import ModelMixin


class TestAutoModel(unittest.TestCase):
@@ -26,9 +20,7 @@ class TestAutoModel(unittest.TestCase):
        side_effect=[EnvironmentError("File not found"), {"model_type": "clip_text_model"}],
    )
    def test_load_from_config_transformers_with_subfolder(self, mock_load_config):
        model = AutoModel.from_pretrained(
            "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="text_encoder", use_safetensors=False
        )
        model = AutoModel.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="text_encoder")
        assert isinstance(model, CLIPTextModel)

    def test_load_from_config_without_subfolder(self):
@@ -36,160 +28,5 @@ class TestAutoModel(unittest.TestCase):
        assert isinstance(model, LongformerModel)

    def test_load_from_model_index(self):
        model = AutoModel.from_pretrained(
            "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="text_encoder", use_safetensors=False
        )
        model = AutoModel.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="text_encoder")
        assert isinstance(model, CLIPTextModel)

    def test_load_dynamic_module_from_local_path_with_subfolder(self):
        CUSTOM_MODEL_CODE = (
            "import torch\n"
            "from diffusers import ModelMixin, ConfigMixin\n"
            "from diffusers.configuration_utils import register_to_config\n"
            "\n"
            "class CustomModel(ModelMixin, ConfigMixin):\n"
            "    @register_to_config\n"
            "    def __init__(self, hidden_size=8):\n"
            "        super().__init__()\n"
            "        self.linear = torch.nn.Linear(hidden_size, hidden_size)\n"
            "\n"
            "    def forward(self, x):\n"
            "        return self.linear(x)\n"
        )

        with tempfile.TemporaryDirectory() as tmpdir:
            subfolder = "custom_model"
            model_dir = os.path.join(tmpdir, subfolder)
            os.makedirs(model_dir)

            with open(os.path.join(model_dir, "modeling.py"), "w") as f:
                f.write(CUSTOM_MODEL_CODE)

            config = {
                "_class_name": "CustomModel",
                "_diffusers_version": "0.0.0",
                "auto_map": {"AutoModel": "modeling.CustomModel"},
                "hidden_size": 8,
            }
            with open(os.path.join(model_dir, "config.json"), "w") as f:
                json.dump(config, f)

            torch.save({}, os.path.join(model_dir, "diffusion_pytorch_model.bin"))

            model = AutoModel.from_pretrained(tmpdir, subfolder=subfolder, trust_remote_code=True)
            assert model.__class__.__name__ == "CustomModel"
            assert model.config["hidden_size"] == 8


class TestAutoModelFromConfig(unittest.TestCase):
    @patch(
        "diffusers.pipelines.pipeline_loading_utils.get_class_obj_and_candidates",
        return_value=(MagicMock(), None),
    )
    def test_from_config_with_dict_diffusers_class(self, mock_get_class):
        config = {"_class_name": "UNet2DConditionModel", "sample_size": 64}
        mock_model = MagicMock()
        mock_get_class.return_value[0].from_config.return_value = mock_model

        result = AutoModel.from_config(config)

        mock_get_class.assert_called_once_with(
            library_name="diffusers",
            class_name="UNet2DConditionModel",
            importable_classes=unittest.mock.ANY,
            pipelines=None,
            is_pipeline_module=False,
        )
        mock_get_class.return_value[0].from_config.assert_called_once_with(config)
        assert result is mock_model

    @patch(
        "diffusers.pipelines.pipeline_loading_utils.get_class_obj_and_candidates",
        return_value=(MagicMock(), None),
    )
    @patch("diffusers.models.AutoModel.load_config", return_value={"_class_name": "UNet2DConditionModel"})
    def test_from_config_with_string_path(self, mock_load_config, mock_get_class):
        mock_model = MagicMock()
        mock_get_class.return_value[0].from_config.return_value = mock_model

        result = AutoModel.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet")

        mock_load_config.assert_called_once()
        assert result is mock_model

    def test_from_config_raises_on_missing_class_info(self):
        config = {"some_key": "some_value"}
        with self.assertRaises(ValueError, msg="Couldn't find a model class"):
            AutoModel.from_config(config)

    @patch(
        "diffusers.pipelines.pipeline_loading_utils.get_class_obj_and_candidates",
        return_value=(MagicMock(), None),
    )
    def test_from_config_with_model_type_routes_to_transformers(self, mock_get_class):
        config = {"model_type": "clip_text_model"}
        mock_model = MagicMock()
        mock_get_class.return_value[0].from_config.return_value = mock_model

        result = AutoModel.from_config(config)

        mock_get_class.assert_called_once_with(
            library_name="transformers",
            class_name="AutoModel",
            importable_classes=unittest.mock.ANY,
            pipelines=None,
            is_pipeline_module=False,
        )
        assert result is mock_model

    def test_from_config_raises_on_none(self):
        with self.assertRaises(ValueError, msg="Please provide a `pretrained_model_name_or_path_or_dict`"):
            AutoModel.from_config(None)

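Taken together, the mocked tests above fix the routing rule for `AutoModel.from_config`: a dict carrying `_class_name` resolves against diffusers, while a dict carrying `model_type` is forwarded to transformers' `AutoModel`. A hedged sketch of that behavior as introduced in this diff:

from diffusers.models import AutoModel

# `_class_name` routes to the named diffusers class; unspecified fields use defaults.
unet = AutoModel.from_config({"_class_name": "UNet2DConditionModel", "sample_size": 64})

# `model_type` routes to transformers' AutoModel machinery instead.
text_encoder = AutoModel.from_config({"model_type": "clip_text_model"})
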
class TestRegisterForAutoClass(unittest.TestCase):
    def test_register_for_auto_class_sets_attribute(self):
        class DummyModel(ModelMixin, ConfigMixin):
            config_name = "config.json"

        DummyModel.register_for_auto_class("AutoModel")
        self.assertEqual(DummyModel._auto_class, "AutoModel")

    def test_register_for_auto_class_rejects_unsupported(self):
        class DummyModel(ModelMixin, ConfigMixin):
            config_name = "config.json"

        with self.assertRaises(ValueError, msg="Only 'AutoModel' is supported"):
            DummyModel.register_for_auto_class("AutoPipeline")

    def test_auto_map_in_saved_config(self):
        class DummyModel(ModelMixin, ConfigMixin):
            config_name = "config.json"

        DummyModel.register_for_auto_class("AutoModel")
        model = DummyModel()

        with tempfile.TemporaryDirectory() as tmpdir:
            model.save_config(tmpdir)
            config_path = os.path.join(tmpdir, "config.json")
            with open(config_path, "r") as f:
                config = json.load(f)

            self.assertIn("auto_map", config)
            self.assertIn("AutoModel", config["auto_map"])
            module_name = DummyModel.__module__.split(".")[-1]
            self.assertEqual(config["auto_map"]["AutoModel"], f"{module_name}.DummyModel")

    def test_no_auto_map_without_register(self):
        class DummyModel(ModelMixin, ConfigMixin):
            config_name = "config.json"

        model = DummyModel()

        with tempfile.TemporaryDirectory() as tmpdir:
            model.save_config(tmpdir)
            config_path = os.path.join(tmpdir, "config.json")
            with open(config_path, "r") as f:
                config = json.load(f)

            self.assertNotIn("auto_map", config)

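For reference, the config these two tests read back would carry (or omit) an entry of roughly this shape; the module name is whatever file defines DummyModel:

# Sketch of the `auto_map` entry written by register_for_auto_class("AutoModel");
# "<defining_module>" is a placeholder for DummyModel's module name.
saved_config = {
    "_class_name": "DummyModel",
    "auto_map": {"AutoModel": "<defining_module>.DummyModel"},  # absent without register_for_auto_class
}
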
@@ -375,7 +375,7 @@ class LoraHotSwappingForModelTesterMixin:
            # additionally check if dynamic compilation works.
            if different_shapes is not None:
                for height, width in different_shapes:
                    new_inputs_dict = self.get_dummy_inputs(height=height, width=width)
                    new_inputs_dict = self.prepare_dummy_input(height=height, width=width)
                    _ = model(**new_inputs_dict)
            else:
                output0_after = model(**inputs_dict)["sample"]
@@ -390,7 +390,7 @@ class LoraHotSwappingForModelTesterMixin:
            with torch.inference_mode():
                if different_shapes is not None:
                    for height, width in different_shapes:
                        new_inputs_dict = self.get_dummy_inputs(height=height, width=width)
                        new_inputs_dict = self.prepare_dummy_input(height=height, width=width)
                        _ = model(**new_inputs_dict)
                else:
                    output1_after = model(**inputs_dict)["sample"]

@@ -1,6 +1,4 @@
import gc
import json
import os
import tempfile
from typing import Callable

@@ -351,33 +349,6 @@ class ModularPipelineTesterMixin:

        assert torch.abs(image_slices[0] - image_slices[1]).max() < 1e-3

    def test_modular_index_consistency(self):
        pipe = self.get_pipeline()
        components_spec = pipe._component_specs
        components = sorted(components_spec.keys())

        with tempfile.TemporaryDirectory() as tmpdir:
            pipe.save_pretrained(tmpdir)
            index_file = os.path.join(tmpdir, "modular_model_index.json")
            assert os.path.exists(index_file)

            with open(index_file) as f:
                index_contents = json.load(f)

            compulsory_keys = {"_blocks_class_name", "_class_name", "_diffusers_version"}
            for k in compulsory_keys:
                assert k in index_contents

            to_check_attrs = {"pretrained_model_name_or_path", "revision", "subfolder"}
            for component in components:
                spec = components_spec[component]
                for attr in to_check_attrs:
                if getattr(spec, "pretrained_model_name_or_path", None) is not None:
                    for attr in to_check_attrs:
                        assert component in index_contents, f"{component} should be present in index but isn't."
                        attr_value_from_index = index_contents[component][2][attr]
                        assert getattr(spec, attr) == attr_value_from_index

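The assertions above pin down the on-disk shape of modular_model_index.json: a few top-level bookkeeping keys, plus one [library, class_name, spec] triple per component. An illustrative sketch (all values hypothetical):

# Hypothetical modular_model_index.json contents matching the assertions above.
index_contents = {
    "_class_name": "ModularPipeline",
    "_blocks_class_name": "SequentialPipelineBlocks",
    "_diffusers_version": "0.0.0",
    "unet": [
        "diffusers",             # library
        "UNet2DConditionModel",  # class name
        {                        # spec dict, i.e. index_contents[component][2]
            "pretrained_model_name_or_path": "hf-internal-testing/tiny-stable-diffusion-xl-pipe",
            "revision": None,
            "subfolder": "unet",
        },
    ],
}
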
    def test_workflow_map(self):
        blocks = self.pipeline_blocks_class()
        if blocks._workflow_map is None:
@@ -483,7 +454,8 @@ class TestModularModelCardContent:
        "blocks_description",
        "components_description",
        "configs_section",
        "io_specification_section",
        "inputs_description",
        "outputs_description",
        "trigger_inputs_section",
        "tags",
    ]
@@ -580,19 +552,18 @@ class TestModularModelCardContent:
        blocks = self.create_mock_blocks(inputs=inputs)
        content = generate_modular_model_card_content(blocks)

        io_section = content["io_specification_section"]
        assert "**Inputs:**" in io_section
        assert "prompt" in io_section
        assert "num_steps" in io_section
        assert "*optional*" in io_section
        assert "defaults to `50`" in io_section
        assert "**Required:**" in content["inputs_description"]
        assert "**Optional:**" in content["inputs_description"]
        assert "prompt" in content["inputs_description"]
        assert "num_steps" in content["inputs_description"]
        assert "default: `50`" in content["inputs_description"]

    def test_inputs_description_empty(self):
        """Test handling of pipelines without specific inputs."""
        blocks = self.create_mock_blocks(inputs=[])
        content = generate_modular_model_card_content(blocks)

        assert "No specific inputs defined" in content["io_specification_section"]
        assert "No specific inputs defined" in content["inputs_description"]

    def test_outputs_description_formatting(self):
        """Test that outputs are correctly formatted."""
@@ -602,16 +573,15 @@ class TestModularModelCardContent:
        blocks = self.create_mock_blocks(outputs=outputs)
        content = generate_modular_model_card_content(blocks)

        io_section = content["io_specification_section"]
        assert "images" in io_section
        assert "Generated images" in io_section
        assert "images" in content["outputs_description"]
        assert "Generated images" in content["outputs_description"]

    def test_outputs_description_empty(self):
        """Test handling of pipelines without specific outputs."""
        blocks = self.create_mock_blocks(outputs=[])
        content = generate_modular_model_card_content(blocks)

        assert "Standard pipeline outputs" in content["io_specification_section"]
        assert "Standard pipeline outputs" in content["outputs_description"]

    def test_trigger_inputs_section_with_triggers(self):
        """Test that trigger inputs section is generated when present."""
@@ -629,6 +599,35 @@ class TestModularModelCardContent:

        assert content["trigger_inputs_section"] == ""

    def test_blocks_description_with_sub_blocks(self):
        """Test that blocks with sub-blocks are correctly described."""

        class MockBlockWithSubBlocks:
            def __init__(self):
                self.__class__.__name__ = "ParentBlock"
                self.description = "Parent block"
                self.sub_blocks = {
                    "child1": self.create_child_block("ChildBlock1", "Child 1 description"),
                    "child2": self.create_child_block("ChildBlock2", "Child 2 description"),
                }

            def create_child_block(self, name, desc):
                class ChildBlock:
                    def __init__(self):
                        self.__class__.__name__ = name
                        self.description = desc

                return ChildBlock()

        blocks = self.create_mock_blocks()
        blocks.sub_blocks["parent"] = MockBlockWithSubBlocks()

        content = generate_modular_model_card_content(blocks)

        assert "parent" in content["blocks_description"]
        assert "child1" in content["blocks_description"]
        assert "child2" in content["blocks_description"]

    def test_model_description_includes_block_count(self):
        """Test that model description includes the number of blocks."""
        blocks = self.create_mock_blocks(num_blocks=5)
@@ -687,18 +686,6 @@ class TestLoadComponentsSkipBehavior:
        assert pipe.unet is not None
        assert getattr(pipe, "vae", None) is None

    def test_load_components_selective_loading_incremental(self):
        """Loading a subset of components should not affect already-loaded components."""
        pipe = ModularPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-xl-pipe")

        pipe.load_components(names="unet", torch_dtype=torch.float32)
        pipe.load_components(names="text_encoder", torch_dtype=torch.float32)

        assert hasattr(pipe, "unet")
        assert pipe.unet is not None
        assert hasattr(pipe, "text_encoder")
        assert pipe.text_encoder is not None

    def test_load_components_skips_invalid_pretrained_path(self):
        pipe = ModularPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-xl-pipe")

@@ -712,133 +699,3 @@ class TestLoadComponentsSkipBehavior:

        # Verify test_component was not loaded
        assert not hasattr(pipe, "test_component") or pipe.test_component is None


class TestCustomModelSavePretrained:
    def test_save_pretrained_updates_index_for_local_model(self, tmp_path):
        """When a component without _diffusers_load_id (custom/local model) is saved,
        modular_model_index.json should point to the save directory."""
        import json

        pipe = ModularPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-xl-pipe")
        pipe.load_components(torch_dtype=torch.float32)

        pipe.unet._diffusers_load_id = "null"

        save_dir = str(tmp_path / "my-pipeline")
        pipe.save_pretrained(save_dir)

        with open(os.path.join(save_dir, "modular_model_index.json")) as f:
            index = json.load(f)

        _library, _cls, unet_spec = index["unet"]
        assert unet_spec["pretrained_model_name_or_path"] == save_dir
        assert unet_spec["subfolder"] == "unet"

        _library, _cls, vae_spec = index["vae"]
        assert vae_spec["pretrained_model_name_or_path"] == "hf-internal-testing/tiny-stable-diffusion-xl-pipe"

    def test_save_pretrained_roundtrip_with_local_model(self, tmp_path):
        """A pipeline with a custom/local model should be saveable and re-loadable with identical outputs."""
        pipe = ModularPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-xl-pipe")
        pipe.load_components(torch_dtype=torch.float32)

        pipe.unet._diffusers_load_id = "null"

        original_state_dict = pipe.unet.state_dict()

        save_dir = str(tmp_path / "my-pipeline")
        pipe.save_pretrained(save_dir)

        loaded_pipe = ModularPipeline.from_pretrained(save_dir)
        loaded_pipe.load_components(torch_dtype=torch.float32)

        assert loaded_pipe.unet is not None
        assert loaded_pipe.unet.__class__.__name__ == pipe.unet.__class__.__name__

        loaded_state_dict = loaded_pipe.unet.state_dict()
        assert set(original_state_dict.keys()) == set(loaded_state_dict.keys())
        for key in original_state_dict:
            assert torch.equal(original_state_dict[key], loaded_state_dict[key]), f"Mismatch in {key}"

    def test_save_pretrained_updates_index_for_model_with_no_load_id(self, tmp_path):
        """Updating the pipeline with a custom model and then saving it should make
        modular_model_index.json point to the save directory."""
        import json

        from diffusers import UNet2DConditionModel

        pipe = ModularPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-xl-pipe")
        pipe.load_components(torch_dtype=torch.float32)

        unet = UNet2DConditionModel.from_pretrained(
            "hf-internal-testing/tiny-stable-diffusion-xl-pipe", subfolder="unet"
        )
        assert not hasattr(unet, "_diffusers_load_id")

        pipe.update_components(unet=unet)

        save_dir = str(tmp_path / "my-pipeline")
        pipe.save_pretrained(save_dir)

        with open(os.path.join(save_dir, "modular_model_index.json")) as f:
            index = json.load(f)

        _library, _cls, unet_spec = index["unet"]
        assert unet_spec["pretrained_model_name_or_path"] == save_dir
        assert unet_spec["subfolder"] == "unet"

        _library, _cls, vae_spec = index["vae"]
        assert vae_spec["pretrained_model_name_or_path"] == "hf-internal-testing/tiny-stable-diffusion-xl-pipe"

    def test_save_pretrained_overwrite_modular_index(self, tmp_path):
        """With overwrite_modular_index=True, all component references should point to the save directory."""
        import json

        pipe = ModularPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-xl-pipe")
        pipe.load_components(torch_dtype=torch.float32)

        save_dir = str(tmp_path / "my-pipeline")
        pipe.save_pretrained(save_dir, overwrite_modular_index=True)

        with open(os.path.join(save_dir, "modular_model_index.json")) as f:
            index = json.load(f)

        for component_name in ["unet", "vae", "text_encoder", "text_encoder_2"]:
            if component_name not in index:
                continue
            _library, _cls, spec = index[component_name]
            assert spec["pretrained_model_name_or_path"] == save_dir, (
                f"{component_name} should point to save dir but got {spec['pretrained_model_name_or_path']}"
            )
            assert spec["subfolder"] == component_name

        loaded_pipe = ModularPipeline.from_pretrained(save_dir)
        loaded_pipe.load_components(torch_dtype=torch.float32)

        assert loaded_pipe.unet is not None
        assert loaded_pipe.vae is not None


class TestModularPipelineInitFallback:
    """Test that ModularPipeline.__init__ falls back to default_blocks_name when
    _blocks_class_name is a base class (e.g. SequentialPipelineBlocks saved by from_blocks_dict)."""

    def test_init_fallback_when_blocks_class_name_is_base_class(self, tmp_path):
        # 1. Load pipeline and get a workflow (returns a base SequentialPipelineBlocks)
        pipe = ModularPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-xl-pipe")
        t2i_blocks = pipe.blocks.get_workflow("text2image")
        assert t2i_blocks.__class__.__name__ == "SequentialPipelineBlocks"

        # 2. Use init_pipeline to create a new pipeline from the workflow blocks
        t2i_pipe = t2i_blocks.init_pipeline("hf-internal-testing/tiny-stable-diffusion-xl-pipe")

        # 3. Save and reload — the saved config will have _blocks_class_name="SequentialPipelineBlocks"
        save_dir = str(tmp_path / "pipeline")
        t2i_pipe.save_pretrained(save_dir)
        loaded_pipe = ModularPipeline.from_pretrained(save_dir)

        # 4. Verify it fell back to default_blocks_name and has correct blocks
        assert loaded_pipe.__class__.__name__ == pipe.__class__.__name__
        assert loaded_pipe._blocks.__class__.__name__ == pipe._blocks.__class__.__name__
        assert len(loaded_pipe._blocks.sub_blocks) == len(pipe._blocks.sub_blocks)

@@ -192,156 +192,6 @@ class TestModularCustomBlocks:
        assert len(pipe.components) == 1
        assert pipe.component_names[0] == "transformer"

    def test_trust_remote_code_not_propagated_to_external_repo(self):
        """When a modular pipeline repo references a component from an external repo that has custom
        code (auto_map in config), calling load_components(trust_remote_code=True) should NOT
        propagate trust_remote_code to that external component. The external component should fail
        to load."""

        from diffusers import ModularPipeline

        CUSTOM_MODEL_CODE = (
            "import torch\n"
            "from diffusers import ModelMixin, ConfigMixin\n"
            "from diffusers.configuration_utils import register_to_config\n"
            "\n"
            "class CustomModel(ModelMixin, ConfigMixin):\n"
            "    @register_to_config\n"
            "    def __init__(self, hidden_size=8):\n"
            "        super().__init__()\n"
            "        self.linear = torch.nn.Linear(hidden_size, hidden_size)\n"
            "\n"
            "    def forward(self, x):\n"
            "        return self.linear(x)\n"
        )

        with tempfile.TemporaryDirectory() as external_repo_dir, tempfile.TemporaryDirectory() as pipeline_repo_dir:
            # Step 1: Create an external model repo with custom code (requires trust_remote_code)
            with open(os.path.join(external_repo_dir, "modeling.py"), "w") as f:
                f.write(CUSTOM_MODEL_CODE)

            config = {
                "_class_name": "CustomModel",
                "_diffusers_version": "0.0.0",
                "auto_map": {"AutoModel": "modeling.CustomModel"},
                "hidden_size": 8,
            }
            with open(os.path.join(external_repo_dir, "config.json"), "w") as f:
                json.dump(config, f)

            torch.save({}, os.path.join(external_repo_dir, "diffusion_pytorch_model.bin"))

            # Step 2: Create a custom block that references the external repo.
            # Define both the class (for direct use) and its code string (for block.py).
            class ExternalRefBlock(ModularPipelineBlocks):
                @property
                def expected_components(self):
                    return [
                        ComponentSpec(
                            "custom_model",
                            AutoModel,
                            pretrained_model_name_or_path=external_repo_dir,
                        )
                    ]

                @property
                def inputs(self) -> List[InputParam]:
                    return [InputParam("prompt", type_hint=str, required=True)]

                @property
                def intermediate_inputs(self) -> List[InputParam]:
                    return []

                @property
                def intermediate_outputs(self) -> List[OutputParam]:
                    return [OutputParam("output", type_hint=str)]

                def __call__(self, components, state: PipelineState) -> PipelineState:
                    block_state = self.get_block_state(state)
                    block_state.output = "test"
                    self.set_block_state(state, block_state)
                    return components, state

            EXTERNAL_REF_BLOCK_CODE_STR = (
                "from typing import List\n"
                "from diffusers import AutoModel\n"
                "from diffusers.modular_pipelines import (\n"
                "    ComponentSpec,\n"
                "    InputParam,\n"
                "    ModularPipelineBlocks,\n"
                "    OutputParam,\n"
                "    PipelineState,\n"
                ")\n"
                "\n"
                "class ExternalRefBlock(ModularPipelineBlocks):\n"
                "    @property\n"
                "    def expected_components(self):\n"
                "        return [\n"
                "            ComponentSpec(\n"
                '                "custom_model",\n'
                "                AutoModel,\n"
                f'                pretrained_model_name_or_path="{external_repo_dir}",\n'
                "            )\n"
                "        ]\n"
                "\n"
                "    @property\n"
                "    def inputs(self) -> List[InputParam]:\n"
                '        return [InputParam("prompt", type_hint=str, required=True)]\n'
                "\n"
                "    @property\n"
                "    def intermediate_inputs(self) -> List[InputParam]:\n"
                "        return []\n"
                "\n"
                "    @property\n"
                "    def intermediate_outputs(self) -> List[OutputParam]:\n"
                '        return [OutputParam("output", type_hint=str)]\n'
                "\n"
                "    def __call__(self, components, state: PipelineState) -> PipelineState:\n"
                "        block_state = self.get_block_state(state)\n"
                '        block_state.output = "test"\n'
                "        self.set_block_state(state, block_state)\n"
                "        return components, state\n"
            )

            # Save the block config, write block.py, then load back via from_pretrained
            block = ExternalRefBlock()
            block.save_pretrained(pipeline_repo_dir)

            # auto_map will reference the module name derived from ExternalRefBlock.__module__,
            # which is "test_modular_pipelines_custom_blocks". Write the code file with that name.
            code_path = os.path.join(pipeline_repo_dir, "test_modular_pipelines_custom_blocks.py")
            with open(code_path, "w") as f:
                f.write(EXTERNAL_REF_BLOCK_CODE_STR)

            block = ModularPipelineBlocks.from_pretrained(pipeline_repo_dir, trust_remote_code=True)
            pipe = block.init_pipeline()
            pipe.save_pretrained(pipeline_repo_dir)

            # Step 3: Load the pipeline from the saved directory.
            loaded_pipe = ModularPipeline.from_pretrained(pipeline_repo_dir, trust_remote_code=True)

            assert loaded_pipe._pretrained_model_name_or_path == pipeline_repo_dir
            assert loaded_pipe._component_specs["custom_model"].pretrained_model_name_or_path == external_repo_dir
            assert getattr(loaded_pipe, "custom_model", None) is None

            # Step 4a: load_components WITHOUT trust_remote_code.
            # It should still fail
            loaded_pipe.load_components()
            assert getattr(loaded_pipe, "custom_model", None) is None

            # Step 4b: load_components with trust_remote_code=True.
            # trust_remote_code should be stripped for the external component, so it fails.
            # The warning should contain guidance about manually loading with trust_remote_code.
            loaded_pipe.load_components(trust_remote_code=True)
            assert getattr(loaded_pipe, "custom_model", None) is None

            # Step 4c: Manually load with AutoModel and update_components — this should work.
            from diffusers import AutoModel

            custom_model = AutoModel.from_pretrained(external_repo_dir, trust_remote_code=True)
            loaded_pipe.update_components(custom_model=custom_model)
            assert getattr(loaded_pipe, "custom_model", None) is not None

    def test_custom_block_loads_from_hub(self):
        repo_id = "hf-internal-testing/tiny-modular-diffusers-block"
        block = ModularPipelineBlocks.from_pretrained(repo_id, trust_remote_code=True)

@@ -282,8 +282,6 @@ class AudioLDM2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        text_inputs = text_inputs["input_ids"].to(torch_device)

        clap_prompt_embeds = audioldm_pipe.text_encoder.get_text_features(text_inputs)
        if hasattr(clap_prompt_embeds, "pooler_output"):
            clap_prompt_embeds = clap_prompt_embeds.pooler_output
        clap_prompt_embeds = clap_prompt_embeds[:, None, :]

        text_inputs = audioldm_pipe.tokenizer_2(
@@ -343,8 +341,6 @@ class AudioLDM2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        text_inputs = text_inputs["input_ids"].to(torch_device)

        clap_prompt_embeds = audioldm_pipe.text_encoder.get_text_features(text_inputs)
        if hasattr(clap_prompt_embeds, "pooler_output"):
            clap_prompt_embeds = clap_prompt_embeds.pooler_output
        clap_prompt_embeds = clap_prompt_embeds[:, None, :]

        text_inputs = audioldm_pipe.tokenizer_2(

@@ -19,7 +19,7 @@ import unittest
import numpy as np
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoConfig, T5EncoderModel, T5TokenizerFast
from transformers import T5EncoderModel, T5TokenizerFast

from diffusers import (
    AutoencoderKL,
@@ -89,8 +89,7 @@ class BriaPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        scheduler = FlowMatchEulerDiscreteScheduler()

        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder = T5EncoderModel(config)
        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
        tokenizer = T5TokenizerFast.from_pretrained("hf-internal-testing/tiny-random-t5")

        components = {

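The Bria hunk above is the first instance of a swap repeated across the remaining test diffs: rather than downloading only a config and instantiating a randomly initialized encoder, the tests now load the tiny checkpoint's weights. Both calls are standard transformers APIs; a side-by-side sketch (the rationale comment is my reading, not stated in the diff):

from transformers import AutoConfig, T5EncoderModel

# Old pattern: fetch only the config and build a randomly initialized model.
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
random_encoder = T5EncoderModel(config)

# New pattern: load the checkpoint's actual weights, so outputs do not depend
# on the (version-dependent) default initialization in transformers.
pretrained_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
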
@@ -2,7 +2,7 @@ import unittest

import numpy as np
import torch
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
from transformers import AutoTokenizer, T5EncoderModel

from diffusers import AutoencoderKL, ChromaPipeline, ChromaTransformer2DModel, FlowMatchEulerDiscreteScheduler

@@ -41,8 +41,7 @@ class ChromaPipelineFastTests(
        )

        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder = T5EncoderModel(config)
        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")

        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")


@@ -3,7 +3,7 @@ import unittest

import numpy as np
import torch
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
from transformers import AutoTokenizer, T5EncoderModel

from diffusers import AutoencoderKL, ChromaImg2ImgPipeline, ChromaTransformer2DModel, FlowMatchEulerDiscreteScheduler

@@ -42,8 +42,7 @@ class ChromaImg2ImgPipelineFastTests(
        )

        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder = T5EncoderModel(config)
        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")

        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")


@@ -17,7 +17,6 @@ import unittest
import torch
from PIL import Image
from transformers import (
    AutoConfig,
    AutoTokenizer,
    CLIPImageProcessor,
    CLIPVisionConfig,
@@ -72,8 +71,7 @@ class ChronoEditPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        torch.manual_seed(0)
        # TODO: impl FlowDPMSolverMultistepScheduler
        scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder = T5EncoderModel(config)
        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

        torch.manual_seed(0)

@@ -18,7 +18,7 @@ import unittest

import numpy as np
import torch
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
from transformers import AutoTokenizer, T5EncoderModel

from diffusers import AutoencoderKLCogVideoX, CogVideoXPipeline, CogVideoXTransformer3DModel, DDIMScheduler

@@ -117,8 +117,7 @@ class CogVideoXPipelineFastTests(

        torch.manual_seed(0)
        scheduler = DDIMScheduler()
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder = T5EncoderModel(config)
        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

        components = {
@@ -236,9 +235,6 @@ class CogVideoXPipelineFastTests(
            return

        components = self.get_dummy_components()
        for key in components:
            if "text_encoder" in key and hasattr(components[key], "eval"):
                components[key].eval()
        pipe = self.pipeline_class(**components)
        for component in pipe.components.values():
            if hasattr(component, "set_default_attn_processor"):

@@ -18,7 +18,7 @@ import unittest
import numpy as np
import torch
from PIL import Image
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
from transformers import AutoTokenizer, T5EncoderModel

from diffusers import AutoencoderKLCogVideoX, CogVideoXFunControlPipeline, CogVideoXTransformer3DModel, DDIMScheduler

@@ -104,8 +104,7 @@ class CogVideoXFunControlPipelineFastTests(PipelineTesterMixin, unittest.TestCas

        torch.manual_seed(0)
        scheduler = DDIMScheduler()
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder = T5EncoderModel(config)
        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

        components = {
@@ -229,9 +228,6 @@ class CogVideoXFunControlPipelineFastTests(PipelineTesterMixin, unittest.TestCas
            return

        components = self.get_dummy_components()
        for key in components:
            if "text_encoder" in key and hasattr(components[key], "eval"):
                components[key].eval()
        pipe = self.pipeline_class(**components)
        for component in pipe.components.values():
            if hasattr(component, "set_default_attn_processor"):

@@ -19,7 +19,7 @@ import unittest
import numpy as np
import torch
from PIL import Image
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
from transformers import AutoTokenizer, T5EncoderModel

from diffusers import AutoencoderKLCogVideoX, CogVideoXImageToVideoPipeline, CogVideoXTransformer3DModel, DDIMScheduler
from diffusers.utils import load_image
@@ -113,8 +113,7 @@ class CogVideoXImageToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestC

        torch.manual_seed(0)
        scheduler = DDIMScheduler()
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder = T5EncoderModel(config)
        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

        components = {
@@ -238,9 +237,6 @@ class CogVideoXImageToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestC
            return

        components = self.get_dummy_components()
        for key in components:
            if "text_encoder" in key and hasattr(components[key], "eval"):
                components[key].eval()
        pipe = self.pipeline_class(**components)
        for component in pipe.components.values():
            if hasattr(component, "set_default_attn_processor"):

@@ -18,7 +18,7 @@ import unittest
import numpy as np
import torch
from PIL import Image
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
from transformers import AutoTokenizer, T5EncoderModel

from diffusers import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel, CogVideoXVideoToVideoPipeline, DDIMScheduler

@@ -99,8 +99,7 @@ class CogVideoXVideoToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestC

        torch.manual_seed(0)
        scheduler = DDIMScheduler()
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder = T5EncoderModel(config)
        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

        components = {

@@ -18,7 +18,7 @@ import unittest

import numpy as np
import torch
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
from transformers import AutoTokenizer, T5EncoderModel

from diffusers import AutoencoderKL, CogVideoXDDIMScheduler, CogView3PlusPipeline, CogView3PlusTransformer2DModel

@@ -89,8 +89,7 @@ class CogView3PlusPipelineFastTests(PipelineTesterMixin, unittest.TestCase):

        torch.manual_seed(0)
        scheduler = CogVideoXDDIMScheduler()
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder = T5EncoderModel(config)
        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

        components = {

@@ -108,7 +108,7 @@ class CogView4PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "dance monkey",
            "negative_prompt": "bad",
            "negative_prompt": "",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,

@@ -19,7 +19,7 @@ import unittest
import numpy as np
import torch
from PIL import Image
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
from transformers import AutoTokenizer, T5EncoderModel

from diffusers import AutoencoderKLCogVideoX, ConsisIDPipeline, ConsisIDTransformer3DModel, DDIMScheduler
from diffusers.utils import load_image
@@ -122,8 +122,7 @@ class ConsisIDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):

        torch.manual_seed(0)
        scheduler = DDIMScheduler()
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder = T5EncoderModel(config)
        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

        components = {
@@ -249,9 +248,6 @@ class ConsisIDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
            return

        components = self.get_dummy_components()
        for key in components:
            if "text_encoder" in key and hasattr(components[key], "eval"):
                components[key].eval()
        pipe = self.pipeline_class(**components)
        for component in pipe.components.values():
            if hasattr(component, "set_default_attn_processor"):

@@ -19,7 +19,7 @@ import unittest
import numpy as np
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoConfig, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast

from diffusers import (
    AutoencoderKL,
@@ -97,8 +97,7 @@ class FluxControlNetPipelineFastTests(unittest.TestCase, PipelineTesterMixin, Fl
        text_encoder = CLIPTextModel(clip_text_encoder_config)

        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder_2 = T5EncoderModel(config)
        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")

        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        tokenizer_2 = T5TokenizerFast.from_pretrained("hf-internal-testing/tiny-random-t5")

@@ -2,7 +2,7 @@ import unittest

import numpy as np
import torch
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel

from diffusers import (
    AutoencoderKL,
@@ -13,7 +13,9 @@ from diffusers import (
)
from diffusers.utils.torch_utils import randn_tensor

from ...testing_utils import torch_device
from ...testing_utils import (
    torch_device,
)
from ..test_pipelines_common import PipelineTesterMixin, check_qkv_fused_layers_exist


@@ -68,8 +70,7 @@ class FluxControlNetImg2ImgPipelineFastTests(unittest.TestCase, PipelineTesterMi
        text_encoder = CLIPTextModel(clip_text_encoder_config)

        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder_2 = T5EncoderModel(config)
        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")

        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

@@ -3,7 +3,15 @@ import unittest

import numpy as np
import torch
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel

from transformers import (
|
||||
AutoTokenizer,
|
||||
CLIPTextConfig,
|
||||
CLIPTextModel,
|
||||
CLIPTokenizer,
|
||||
T5EncoderModel,
|
||||
)
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -14,7 +22,11 @@ from diffusers import (
|
||||
)
|
||||
from diffusers.utils.torch_utils import randn_tensor
|
||||
|
||||
from ...testing_utils import enable_full_determinism, floats_tensor, torch_device
|
||||
from ...testing_utils import (
|
||||
enable_full_determinism,
|
||||
floats_tensor,
|
||||
torch_device,
|
||||
)
|
||||
from ..test_pipelines_common import PipelineTesterMixin
|
||||
|
||||
|
||||
@@ -73,8 +85,7 @@ class FluxControlNetInpaintPipelineTests(unittest.TestCase, PipelineTesterMixin)
|
||||
text_encoder = CLIPTextModel(clip_text_encoder_config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_2 = T5EncoderModel(config)
|
||||
text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
@@ -18,7 +18,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoConfig, AutoTokenizer, BertModel, T5EncoderModel
|
||||
from transformers import AutoTokenizer, BertModel, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -96,10 +96,7 @@ class HunyuanDiTControlNetPipelineFastTests(unittest.TestCase, PipelineTesterMix
|
||||
scheduler = DDPMScheduler()
|
||||
text_encoder = BertModel.from_pretrained("hf-internal-testing/tiny-random-BertModel")
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertModel")
|
||||
|
||||
torch.manual_seed(0)
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_2 = T5EncoderModel(config)
|
||||
text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
|
||||
@@ -17,14 +17,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
CLIPTextConfig,
|
||||
CLIPTextModelWithProjection,
|
||||
CLIPTokenizer,
|
||||
T5EncoderModel,
|
||||
)
|
||||
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -35,7 +28,10 @@ from diffusers import (
|
||||
from diffusers.models import SD3ControlNetModel
|
||||
from diffusers.utils.torch_utils import randn_tensor
|
||||
|
||||
from ...testing_utils import enable_full_determinism, torch_device
|
||||
from ...testing_utils import (
|
||||
enable_full_determinism,
|
||||
torch_device,
|
||||
)
|
||||
from ..test_pipelines_common import PipelineTesterMixin
|
||||
|
||||
|
||||
@@ -107,8 +103,7 @@ class StableDiffusion3ControlInpaintNetPipelineFastTests(unittest.TestCase, Pipe
|
||||
text_encoder_2 = CLIPTextModelWithProjection(clip_text_encoder_config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_3 = T5EncoderModel(config)
|
||||
text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
|
||||
@@ -18,14 +18,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
CLIPTextConfig,
|
||||
CLIPTextModelWithProjection,
|
||||
CLIPTokenizer,
|
||||
T5EncoderModel,
|
||||
)
|
||||
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
@@ -124,8 +117,7 @@ class StableDiffusion3ControlNetPipelineFastTests(unittest.TestCase, PipelineTes
|
||||
text_encoder_2 = CLIPTextModelWithProjection(clip_text_encoder_config)
|
||||
|
||||
torch.manual_seed(0)
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder_3 = T5EncoderModel(config)
|
||||
text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
|
||||
@@ -20,7 +20,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import AutoencoderKLCosmos, CosmosTextToWorldPipeline, CosmosTransformer3DModel, EDMEulerScheduler
|
||||
|
||||
@@ -107,8 +107,7 @@ class CosmosTextToWorldPipelineFastTests(PipelineTesterMixin, unittest.TestCase)
|
||||
rho=7.0,
|
||||
final_sigmas_type="sigma_min",
|
||||
)
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
@@ -233,9 +232,6 @@ class CosmosTextToWorldPipelineFastTests(PipelineTesterMixin, unittest.TestCase)
|
||||
return
|
||||
|
||||
components = self.get_dummy_components()
|
||||
for key in components:
|
||||
if "text_encoder" in key and hasattr(components[key], "eval"):
|
||||
components[key].eval()
|
||||
pipe = self.pipeline_class(**components)
|
||||
for component in pipe.components.values():
|
||||
if hasattr(component, "set_default_attn_processor"):
|
||||
|
||||
@@ -55,7 +55,7 @@ class Cosmos2_5_TransferWrapper(Cosmos2_5_TransferPipeline):
|
||||
class Cosmos2_5_TransferPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
pipeline_class = Cosmos2_5_TransferWrapper
|
||||
params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
|
||||
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS.union({"controls"})
|
||||
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
|
||||
image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
|
||||
image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
|
||||
required_optional_params = frozenset(
|
||||
@@ -176,19 +176,15 @@ class Cosmos2_5_TransferPipelineFastTests(PipelineTesterMixin, unittest.TestCase
|
||||
else:
|
||||
generator = torch.Generator(device=device).manual_seed(seed)
|
||||
|
||||
controls_generator = torch.Generator(device="cpu").manual_seed(seed)
|
||||
|
||||
inputs = {
|
||||
"prompt": "dance monkey",
|
||||
"negative_prompt": "bad quality",
|
||||
"controls": [torch.randn(3, 32, 32, generator=controls_generator) for _ in range(5)],
|
||||
"generator": generator,
|
||||
"num_inference_steps": 2,
|
||||
"guidance_scale": 3.0,
|
||||
"height": 32,
|
||||
"width": 32,
|
||||
"num_frames": 3,
|
||||
"num_frames_per_chunk": 16,
|
||||
"max_sequence_length": 16,
|
||||
"output_type": "pt",
|
||||
}
|
||||
@@ -216,56 +212,6 @@ class Cosmos2_5_TransferPipelineFastTests(PipelineTesterMixin, unittest.TestCase
|
||||
self.assertEqual(generated_video.shape, (3, 3, 32, 32))
|
||||
self.assertTrue(torch.isfinite(generated_video).all())
|
||||
|
||||
def test_inference_autoregressive_multi_chunk(self):
|
||||
device = "cpu"
|
||||
|
||||
components = self.get_dummy_components()
|
||||
pipe = self.pipeline_class(**components)
|
||||
pipe.to(device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
inputs = self.get_dummy_inputs(device)
|
||||
inputs["num_frames"] = 5
|
||||
inputs["num_frames_per_chunk"] = 3
|
||||
inputs["num_ar_conditional_frames"] = 1
|
||||
|
||||
video = pipe(**inputs).frames
|
||||
generated_video = video[0]
|
||||
self.assertEqual(generated_video.shape, (5, 3, 32, 32))
|
||||
self.assertTrue(torch.isfinite(generated_video).all())
|
||||
|
||||
def test_inference_autoregressive_multi_chunk_no_condition_frames(self):
|
||||
device = "cpu"
|
||||
|
||||
components = self.get_dummy_components()
|
||||
pipe = self.pipeline_class(**components)
|
||||
pipe.to(device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
inputs = self.get_dummy_inputs(device)
|
||||
inputs["num_frames"] = 5
|
||||
inputs["num_frames_per_chunk"] = 3
|
||||
inputs["num_ar_conditional_frames"] = 0
|
||||
|
||||
video = pipe(**inputs).frames
|
||||
generated_video = video[0]
|
||||
self.assertEqual(generated_video.shape, (5, 3, 32, 32))
|
||||
self.assertTrue(torch.isfinite(generated_video).all())
|
||||
|
||||
def test_num_frames_per_chunk_above_rope_raises(self):
|
||||
device = "cpu"
|
||||
|
||||
components = self.get_dummy_components()
|
||||
pipe = self.pipeline_class(**components)
|
||||
pipe.to(device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
inputs = self.get_dummy_inputs(device)
|
||||
inputs["num_frames_per_chunk"] = 17
|
||||
|
||||
with self.assertRaisesRegex(ValueError, "too large for RoPE setting"):
|
||||
pipe(**inputs)
|
||||
|
||||
def test_inference_with_controls(self):
|
||||
"""Test inference with control inputs (ControlNet)."""
|
||||
device = "cpu"
|
||||
@@ -276,13 +222,13 @@ class Cosmos2_5_TransferPipelineFastTests(PipelineTesterMixin, unittest.TestCase
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
inputs = self.get_dummy_inputs(device)
|
||||
inputs["controls"] = [torch.randn(3, 32, 32) for _ in range(5)] # list of 5 frames (C, H, W)
|
||||
# Add control video input - should be a video tensor
|
||||
inputs["controls"] = [torch.randn(3, 3, 32, 32)] # num_frames, channels, height, width
|
||||
inputs["controls_conditioning_scale"] = 1.0
|
||||
inputs["num_frames"] = None
|
||||
|
||||
video = pipe(**inputs).frames
|
||||
generated_video = video[0]
|
||||
self.assertEqual(generated_video.shape, (5, 3, 32, 32))
|
||||
self.assertEqual(generated_video.shape, (3, 3, 32, 32))
|
||||
self.assertTrue(torch.isfinite(generated_video).all())
|
||||
|
||||
def test_callback_inputs(self):
|
||||
|
||||
@@ -20,7 +20,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
|
||||
from transformers import AutoTokenizer, T5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKLWan,
|
||||
@@ -95,8 +95,7 @@ class Cosmos2TextToImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase
|
||||
|
||||
torch.manual_seed(0)
|
||||
scheduler = FlowMatchEulerDiscreteScheduler(use_karras_sigmas=True)
|
||||
config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
text_encoder = T5EncoderModel(config)
|
||||
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
|
||||
|
||||
components = {
|
||||
|
||||
@@ -21,7 +21,7 @@ import unittest
import numpy as np
import PIL.Image
import torch
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
from transformers import AutoTokenizer, T5EncoderModel

from diffusers import (
    AutoencoderKLWan,
@@ -96,8 +96,7 @@ class Cosmos2VideoToWorldPipelineFastTests(PipelineTesterMixin, unittest.TestCase):

        torch.manual_seed(0)
        scheduler = FlowMatchEulerDiscreteScheduler(use_karras_sigmas=True)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder = T5EncoderModel(config)
        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

        components = {

@@ -21,7 +21,7 @@ import unittest
import numpy as np
import PIL.Image
import torch
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
from transformers import AutoTokenizer, T5EncoderModel

from diffusers import AutoencoderKLCosmos, CosmosTransformer3DModel, CosmosVideoToWorldPipeline, EDMEulerScheduler

@@ -108,8 +108,7 @@ class CosmosVideoToWorldPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
            rho=7.0,
            final_sigmas_type="sigma_min",
        )
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder = T5EncoderModel(config)
        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

        components = {
@@ -246,9 +245,6 @@ class CosmosVideoToWorldPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
            return

        components = self.get_dummy_components()
        for key in components:
            if "text_encoder" in key and hasattr(components[key], "eval"):
                components[key].eval()
        pipe = self.pipeline_class(**components)
        for component in pipe.components.values():
            if hasattr(component, "set_default_attn_processor"):

@@ -2,7 +2,7 @@ import tempfile

import numpy as np
import torch
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
from transformers import AutoTokenizer, T5EncoderModel

from diffusers import DDPMScheduler, UNet2DConditionModel
from diffusers.models.attention_processor import AttnAddedKVProcessor
@@ -18,8 +18,7 @@ from ..test_pipelines_common import to_np
class IFPipelineTesterMixin:
    def _get_dummy_components(self):
        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder = T5EncoderModel(config)
        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")

        torch.manual_seed(0)
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
@@ -76,8 +75,7 @@ class IFPipelineTesterMixin:

    def _get_superresolution_dummy_components(self):
        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder = T5EncoderModel(config)
        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")

        torch.manual_seed(0)
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
@@ -252,9 +250,6 @@ class IFPipelineTesterMixin:
    # This should be handled in the base test and then this method can be removed.
    def _test_save_load_local(self):
        components = self.get_dummy_components()
        for key in components:
            if "text_encoder" in key and hasattr(components[key], "eval"):
                components[key].eval()
        pipe = self.pipeline_class(**components)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
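Both save/load test variants above drop the loop that forced text encoders into eval mode. That is consistent with standard transformers behavior rather than anything specific to this diff: from_pretrained returns a model already in eval mode, while a model constructed directly from a config starts in train mode. A quick check, assuming the tiny checkpoint is reachable:

from transformers import AutoConfig, T5EncoderModel

from_config = T5EncoderModel(AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5"))
from_hub = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
print(from_config.training)  # True  -> the explicit .eval() loop was needed
print(from_hub.training)     # False -> the loop is now redundant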
@@ -18,7 +18,9 @@ import unittest

import torch

from diffusers import IFPipeline
from diffusers import (
    IFPipeline,
)
from diffusers.models.attention_processor import AttnAddedKVProcessor
from diffusers.utils.import_utils import is_xformers_available


@@ -4,7 +4,7 @@ import unittest
import numpy as np
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel

from diffusers import (
    AutoencoderKL,
@@ -93,8 +93,7 @@ class FluxPipelineFastTests(
        text_encoder = CLIPTextModel(clip_text_encoder_config)

        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder_2 = T5EncoderModel(config)
        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")

        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
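The Flux tests converge on a mixed setup: the CLIP encoder is still built from a local config under a fixed seed, while the T5 encoder comes from the pinned tiny checkpoint. A hedged sketch of that combination; the CLIPTextConfig values here are illustrative, chosen only to keep the model tiny, and are not the ones used in the tests:

import torch
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel

torch.manual_seed(0)
clip_config = CLIPTextConfig(  # illustrative sizes
    vocab_size=1000,
    hidden_size=8,
    intermediate_size=8,
    num_attention_heads=2,
    num_hidden_layers=2,
    max_position_embeddings=77,
)
text_encoder = CLIPTextModel(clip_config)  # seeded random weights
text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")  # fixed weights
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")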
@@ -3,7 +3,7 @@ import unittest
import numpy as np
import torch
from PIL import Image
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel

from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxControlPipeline, FluxTransformer2DModel

@@ -53,8 +53,7 @@ class FluxControlPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
        text_encoder = CLIPTextModel(clip_text_encoder_config)

        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder_2 = T5EncoderModel(config)
        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")

        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

@@ -3,7 +3,7 @@ import unittest
import numpy as np
import torch
from PIL import Image
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel

from diffusers import (
    AutoencoderKL,
@@ -57,8 +57,7 @@ class FluxControlImg2ImgPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
        text_encoder = CLIPTextModel(clip_text_encoder_config)

        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder_2 = T5EncoderModel(config)
        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")

        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

@@ -3,7 +3,7 @@ import unittest
import numpy as np
import torch
from PIL import Image
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel

from diffusers import (
    AutoencoderKL,
@@ -58,8 +58,7 @@ class FluxControlInpaintPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
        text_encoder = CLIPTextModel(clip_text_encoder_config)

        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder_2 = T5EncoderModel(config)
        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")

        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

@@ -3,7 +3,7 @@ import unittest

import numpy as np
import torch
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel

from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxFillPipeline, FluxTransformer2DModel

@@ -58,8 +58,7 @@ class FluxFillPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
        text_encoder = CLIPTextModel(clip_text_encoder_config)

        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder_2 = T5EncoderModel(config)
        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")

        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

@@ -3,7 +3,7 @@ import unittest

import numpy as np
import torch
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel

from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxImg2ImgPipeline, FluxTransformer2DModel

@@ -55,8 +55,7 @@ class FluxImg2ImgPipelineFastTests(unittest.TestCase, PipelineTesterMixin, FluxIPAdapterTesterMixin):
        text_encoder = CLIPTextModel(clip_text_encoder_config)

        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder_2 = T5EncoderModel(config)
        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")

        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

@@ -3,7 +3,7 @@ import unittest

import numpy as np
import torch
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel

from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxInpaintPipeline, FluxTransformer2DModel

@@ -55,8 +55,7 @@ class FluxInpaintPipelineFastTests(unittest.TestCase, PipelineTesterMixin, FluxIPAdapterTesterMixin):
        text_encoder = CLIPTextModel(clip_text_encoder_config)

        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder_2 = T5EncoderModel(config)
        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")

        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

@@ -3,7 +3,7 @@ import unittest
import numpy as np
import PIL.Image
import torch
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel

from diffusers import (
    AutoencoderKL,
@@ -79,8 +79,7 @@ class FluxKontextPipelineFastTests(
        text_encoder = CLIPTextModel(clip_text_encoder_config)

        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder_2 = T5EncoderModel(config)
        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")

        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

@@ -3,7 +3,7 @@ import unittest

import numpy as np
import torch
from transformers import AutoConfig, AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel
from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel

from diffusers import (
    AutoencoderKL,
@@ -79,8 +79,7 @@ class FluxKontextInpaintPipelineFastTests(
        text_encoder = CLIPTextModel(clip_text_encoder_config)

        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder_2 = T5EncoderModel(config)
        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")

        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

@@ -16,7 +16,7 @@ import unittest

import numpy as np
import torch
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
from transformers import AutoTokenizer, T5EncoderModel

from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, GlmImagePipeline, GlmImageTransformer2DModel
from diffusers.utils import is_transformers_version
@@ -57,8 +57,7 @@ class GlmImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):

    def get_dummy_components(self):
        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder = T5EncoderModel(config)
        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

        glm_config = GlmImageConfig(

@@ -18,7 +18,6 @@ import unittest
import numpy as np
import torch
from transformers import (
    AutoConfig,
    AutoTokenizer,
    CLIPTextConfig,
    CLIPTextModelWithProjection,
@@ -95,8 +94,7 @@ class HiDreamImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        text_encoder_2 = CLIPTextModelWithProjection(clip_text_encoder_config)

        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder_3 = T5EncoderModel(config)
        text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")

        torch.manual_seed(0)
        text_encoder_4 = LlamaForCausalLM.from_pretrained("hf-internal-testing/tiny-random-LlamaForCausalLM")
@@ -151,12 +149,12 @@ class HiDreamImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        self.assertEqual(generated_image.shape, (128, 128, 3))

        # fmt: off
        expected_slice = np.array([0.4501, 0.5256, 0.4207, 0.5783, 0.4842, 0.4833, 0.4441, 0.5112, 0.6587, 0.3169, 0.7308, 0.5927, 0.6251, 0.5509, 0.5355, 0.5969])
        expected_slice = np.array([0.4507, 0.5256, 0.4205, 0.5791, 0.4848, 0.4831, 0.4443, 0.5107, 0.6586, 0.3163, 0.7318, 0.5933, 0.6252, 0.5512, 0.5357, 0.5983])
        # fmt: on

        generated_slice = generated_image.flatten()
        generated_slice = np.concatenate([generated_slice[:8], generated_slice[-8:]])
        self.assertTrue(np.allclose(generated_slice, expected_slice, atol=5e-3))
        self.assertTrue(np.allclose(generated_slice, expected_slice, atol=1e-3))

    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical(expected_max_diff=3e-4)
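The HiDream slice check above follows the recipe used across these fast tests: flatten the output, keep the first and last eight values, and compare against a pinned reference with a small absolute tolerance (tightened here from 5e-3 to 1e-3). A standalone sketch with illustrative values:

import numpy as np

generated_image = np.random.rand(128, 128, 3)   # stand-in for a pipeline output
expected_slice = np.zeros(16)                    # illustrative reference values

generated_slice = generated_image.flatten()
generated_slice = np.concatenate([generated_slice[:8], generated_slice[-8:]])
matches = np.allclose(generated_slice, expected_slice, atol=1e-3)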
@@ -223,7 +223,7 @@ class HunyuanImagePipelineFastTests(
        self.assertEqual(generated_image.shape, (3, 16, 16))

        expected_slice_np = np.array(
            [0.6068114, 0.48716035, 0.5984431, 0.60241306, 0.48849544, 0.5624479, 0.53696984, 0.58964247, 0.54248774]
            [0.61494756, 0.49616697, 0.60327923, 0.6115793, 0.49047345, 0.56977504, 0.53066164, 0.58880305, 0.5570612]
        )
        output_slice = generated_image[0, -3:, -3:].flatten().cpu().numpy()


@@ -233,7 +233,7 @@ class HunyuanVideoImageToVideoPipelineFastTests(
        self.assertEqual(generated_video.shape, (5, 3, 16, 16))

        # fmt: off
        expected_slice = torch.tensor([0.4441, 0.4790, 0.4485, 0.5748, 0.3539, 0.1553, 0.2707, 0.3594, 0.5331, 0.6645, 0.6799, 0.5257, 0.5092, 0.3450, 0.4276, 0.4127])
        expected_slice = torch.tensor([0.444, 0.479, 0.4485, 0.5752, 0.3539, 0.1548, 0.2706, 0.3593, 0.5323, 0.6635, 0.6795, 0.5255, 0.5091, 0.345, 0.4276, 0.4128])
        # fmt: on

        generated_slice = generated_video.flatten()

@@ -15,14 +15,7 @@
import unittest

import torch
from transformers import (
    AutoConfig,
    ByT5Tokenizer,
    Qwen2_5_VLTextConfig,
    Qwen2_5_VLTextModel,
    Qwen2Tokenizer,
    T5EncoderModel,
)
from transformers import ByT5Tokenizer, Qwen2_5_VLTextConfig, Qwen2_5_VLTextModel, Qwen2Tokenizer, T5EncoderModel

from diffusers import (
    AutoencoderKLHunyuanVideo15,
@@ -121,8 +114,7 @@ class HunyuanVideo15PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        tokenizer = Qwen2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")

        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder_2 = T5EncoderModel(config)
        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
        tokenizer_2 = ByT5Tokenizer()

        guider = ClassifierFreeGuidance(guidance_scale=1.0)

@@ -19,7 +19,7 @@ import unittest

import numpy as np
import torch
from transformers import AutoConfig, AutoTokenizer, BertModel, T5EncoderModel
from transformers import AutoTokenizer, BertModel, T5EncoderModel

from diffusers import AutoencoderKL, DDPMScheduler, HunyuanDiT2DModel, HunyuanDiTPipeline

@@ -74,9 +74,7 @@ class HunyuanDiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        scheduler = DDPMScheduler()
        text_encoder = BertModel.from_pretrained("hf-internal-testing/tiny-random-BertModel")
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertModel")
        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder_2 = T5EncoderModel(config)
        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
        tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

        components = {

@@ -19,7 +19,7 @@ import unittest
import numpy as np
import torch
from PIL import Image
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
from transformers import AutoTokenizer, T5EncoderModel

from diffusers import (
    AutoPipelineForImage2Image,
@@ -108,8 +108,7 @@ class Kandinsky3PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        torch.manual_seed(0)
        movq = self.dummy_movq
        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder = T5EncoderModel(config).eval()
        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")

        torch.manual_seed(0)
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
@@ -156,9 +155,9 @@ class Kandinsky3PipelineFastTests(PipelineTesterMixin, unittest.TestCase):

        assert image.shape == (1, 16, 16, 3)

        expected_slice = np.array([0.3944, 0.3680, 0.4842, 0.5333, 0.4412, 0.4812, 0.5089, 0.5381, 0.5578])
        expected_slice = np.array([0.3768, 0.4373, 0.4865, 0.4890, 0.4299, 0.5122, 0.4921, 0.4924, 0.5599])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1, (
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, (
            f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
        )
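The Kandinsky3 hunk above both re-pins the expected slice and tightens the tolerance from 1e-1 to 1e-2. A plausible reading, hedged since the diff itself states no rationale: from_pretrained loads fixed weights from the Hub, so the encoder no longer depends on local RNG behavior at construction time, which permits tighter output pins. A small check that two loads agree bit-for-bit:

from transformers import T5EncoderModel

enc_a = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
enc_b = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
for (name, p_a), (_, p_b) in zip(enc_a.named_parameters(), enc_b.named_parameters()):
    assert (p_a == p_b).all(), name  # identical weights regardless of torch seed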
@@ -20,7 +20,7 @@ import unittest
import numpy as np
import torch
from PIL import Image
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
from transformers import AutoTokenizer, T5EncoderModel

from diffusers import (
    AutoPipelineForImage2Image,
@@ -119,8 +119,7 @@ class Kandinsky3Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        torch.manual_seed(0)
        movq = self.dummy_movq
        torch.manual_seed(0)
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder = T5EncoderModel(config).eval()
        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")

        torch.manual_seed(0)
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
@@ -156,7 +155,10 @@ class Kandinsky3Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        return inputs

    def test_dict_tuple_outputs_equivalent(self):
        super().test_dict_tuple_outputs_equivalent()
        expected_slice = None
        if torch_device == "cpu":
            expected_slice = np.array([0.5762, 0.6112, 0.4150, 0.6018, 0.6167, 0.4626, 0.5426, 0.5641, 0.6536])
        super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)

    def test_kandinsky3_img2img(self):
        device = "cpu"
@@ -175,9 +177,11 @@ class Kandinsky3Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array([0.5725, 0.6248, 0.4355, 0.5732, 0.6105, 0.5267, 0.5470, 0.5512, 0.6618])
        expected_slice = np.array(
            [0.576259, 0.6132097, 0.41703486, 0.603196, 0.62062526, 0.4655338, 0.5434324, 0.5660727, 0.65433365]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1, (
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, (
            f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
        )


@@ -20,7 +20,7 @@ import unittest

import numpy as np
import torch
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
from transformers import AutoTokenizer, T5EncoderModel

from diffusers import (
    AutoencoderKL,
@@ -109,8 +109,7 @@ class LattePipelineFastTests(
        vae = AutoencoderKL()

        scheduler = DDIMScheduler()
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder = T5EncoderModel(config)
        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")

        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")


@@ -17,7 +17,7 @@ import unittest

import numpy as np
import torch
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
from transformers import AutoTokenizer, T5EncoderModel

from diffusers import AutoencoderKLLTXVideo, FlowMatchEulerDiscreteScheduler, LTXPipeline, LTXVideoTransformer3DModel

@@ -88,8 +88,7 @@ class LTXPipelineFastTests(PipelineTesterMixin, FirstBlockCacheTesterMixin, unittest.TestCase):

        torch.manual_seed(0)
        scheduler = FlowMatchEulerDiscreteScheduler()
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder = T5EncoderModel(config)
        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

        components = {

@@ -17,7 +17,7 @@ import unittest

import numpy as np
import torch
from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
from transformers import AutoTokenizer, T5EncoderModel

from diffusers import (
    AutoencoderKLLTXVideo,
@@ -92,8 +92,7 @@ class LTXConditionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):

        torch.manual_seed(0)
        scheduler = FlowMatchEulerDiscreteScheduler()
        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
        text_encoder = T5EncoderModel(config)
        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

        components = {
Some files were not shown because too many files have changed in this diff.