fix group offloading when using torchao

2026-03-19 06:58:01 +08:00 · 2026-03-17 10:40:03 +05:30
2 changed files with 67 additions and 218 deletions
--- a/src/diffusers/hooks/group_offloading.py
+++ b/src/diffusers/hooks/group_offloading.py
@@ -22,7 +22,7 @@ from typing import Set
 import safetensors.torch
 import torch

-from ..utils import get_logger, is_accelerate_available
+from ..utils import get_logger, is_accelerate_available, is_torchao_available
 from ._common import _GO_LC_SUPPORTED_PYTORCH_LAYERS
 from .hooks import HookRegistry, ModelHook

@@ -35,6 +35,41 @@ if is_accelerate_available():
 logger = get_logger(__name__)  # pylint: disable=invalid-name


+def _is_torchao_tensor(tensor: torch.Tensor) -> bool:
+    """Check if a tensor is a TorchAO quantized tensor subclass."""
+    if not is_torchao_available():
+        return False  # torchao unavailable: skip the import below and report "not a TorchAO tensor"
+    from torchao.utils import TorchAOBaseTensor  # deferred until the availability check has passed
+
+    return isinstance(tensor, TorchAOBaseTensor)
+
+
+def _get_torchao_inner_tensor_names(tensor: torch.Tensor) -> list[str]:
+    """Get names of all internal tensor data attributes from a TorchAO tensor."""
+    cls = type(tensor)
+    names = list(getattr(cls, "tensor_data_names", []))  # copy: appending below must not mutate the class attribute
+    for attr_name in getattr(cls, "optional_tensor_data_names", []):
+        if getattr(tensor, attr_name, None) is not None:  # skip optional tensors that are unset on this instance
+            names.append(attr_name)
+    return names
+
+
+def _update_torchao_tensor_in_place(param: torch.Tensor, source: torch.Tensor) -> None:
+    """Update internal tensor data of a TorchAO parameter in-place from source.
+
+    Must operate on the parameter/buffer object directly (not ``param.data``) because ``_make_wrapper_subclass``
+    returns a fresh wrapper from ``.data`` each time, so attribute mutations on ``.data`` are lost.
+    """
+    for attr_name in _get_torchao_inner_tensor_names(source):  # names are taken from ``source``, not ``param``
+        setattr(param, attr_name, getattr(source, attr_name))  # rebind to source's inner tensor
+
+
+def _record_stream_torchao_tensor(param: torch.Tensor, stream) -> None:
+    """Record stream for all internal tensors of a TorchAO parameter."""
+    for attr_name in _get_torchao_inner_tensor_names(param):
+        getattr(param, attr_name).record_stream(stream)  # called on each inner tensor, not on the wrapper
+
+
 # fmt: off
 _GROUP_OFFLOADING = "group_offloading"
 _LAYER_EXECUTION_TRACKER = "layer_execution_tracker"
@@ -157,9 +192,16 @@ class ModuleGroup:
            pinned_dict = None

    def _transfer_tensor_to_device(self, tensor, source_tensor, default_stream):
-        tensor.data = source_tensor.to(self.onload_device, non_blocking=self.non_blocking)
+        moved = source_tensor.to(self.onload_device, non_blocking=self.non_blocking)
+        if _is_torchao_tensor(tensor):
+            _update_torchao_tensor_in_place(tensor, moved)
+        else:
+            tensor.data = moved
        if self.record_stream:
-            tensor.data.record_stream(default_stream)
+            if _is_torchao_tensor(tensor):
+                _record_stream_torchao_tensor(tensor, default_stream)
+            else:
+                tensor.data.record_stream(default_stream)

    def _process_tensors_from_modules(self, pinned_memory=None, default_stream=None):
        for group_module in self.modules:
@@ -245,18 +287,35 @@ class ModuleGroup:

            for group_module in self.modules:
                for param in group_module.parameters():
-                    param.data = self.cpu_param_dict[param]
+                    if _is_torchao_tensor(param):
+                        _update_torchao_tensor_in_place(param, self.cpu_param_dict[param])
+                    else:
+                        param.data = self.cpu_param_dict[param]
            for param in self.parameters:
-                param.data = self.cpu_param_dict[param]
+                if _is_torchao_tensor(param):
+                    _update_torchao_tensor_in_place(param, self.cpu_param_dict[param])
+                else:
+                    param.data = self.cpu_param_dict[param]
            for buffer in self.buffers:
-                buffer.data = self.cpu_param_dict[buffer]
+                if _is_torchao_tensor(buffer):
+                    _update_torchao_tensor_in_place(buffer, self.cpu_param_dict[buffer])
+                else:
+                    buffer.data = self.cpu_param_dict[buffer]
        else:
            for group_module in self.modules:
                group_module.to(self.offload_device, non_blocking=False)
            for param in self.parameters:
-                param.data = param.data.to(self.offload_device, non_blocking=False)
+                if _is_torchao_tensor(param):
+                    moved = param.data.to(self.offload_device, non_blocking=False)
+                    _update_torchao_tensor_in_place(param, moved)
+                else:
+                    param.data = param.data.to(self.offload_device, non_blocking=False)
            for buffer in self.buffers:
-                buffer.data = buffer.data.to(self.offload_device, non_blocking=False)
+                if _is_torchao_tensor(buffer):
+                    moved = buffer.data.to(self.offload_device, non_blocking=False)
+                    _update_torchao_tensor_in_place(buffer, moved)
+                else:
+                    buffer.data = buffer.data.to(self.offload_device, non_blocking=False)

    @torch.compiler.disable()
    def onload_(self):
--- a/utils/make_tiny_model.py
+++ b/utils/make_tiny_model.py
@@ -1,210 +0,0 @@
-# /// script
-# requires-python = ">=3.10"
-# dependencies = [
-#     "diffusers",
-#     "torch",
-#     "huggingface_hub",
-#     "accelerate",
-#     "transformers",
-#     "sentencepiece",
-#     "protobuf",
-# ]
-# ///
-
-# Copyright 2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Utility script to create tiny versions of diffusers models by reducing layer counts.
-
-Can be run locally or submitted as an HF Job via `--launch`.
-
-Usage:
-    # Run locally
-    python make_tiny_model.py --model_repo_id <model_repo_id> --output_repo_id <output_repo_id> [--subfolder transformer] [--num_layers 2]
-
-    # Push to Hub
-    python make_tiny_model.py --model_repo_id <model_repo_id> --output_repo_id <output_repo_id> --push_to_hub --token $HF_TOKEN
-
-    # Submit as an HF Job
-    python make_tiny_model.py --model_repo_id <model_repo_id> --output_repo_id <output_repo_id> --launch [--flavor cpu-basic]
-"""
-
-import argparse
-import os
-import re
-
-
-LAYER_PARAM_PATTERN = re.compile(r"^(num_.*layers?|n_layers|n_refiner_layers)$")
-
-DIM_PARAM_PATTERNS = {
-    re.compile(r"^num_attention_heads$"): 2,
-    re.compile(r"^num_.*attention_heads$"): 2,
-    re.compile(r"^num_key_value_heads$"): 2,
-    re.compile(r"^num_kv_heads$"): 1,
-    re.compile(r"^n_heads$"): 2,
-    re.compile(r"^n_kv_heads$"): 2,
-    re.compile(r"^attention_head_dim$"): 8,
-    re.compile(r"^.*attention_head_dim$"): 4,
-    re.compile(r"^cross_attention_dim.*$"): 8,
-    re.compile(r"^joint_attention_dim$"): 32,
-    re.compile(r"^pooled_projection_dim$"): 32,
-    re.compile(r"^caption_projection_dim$"): 32,
-    re.compile(r"^caption_channels$"): 8,
-    re.compile(r"^cap_feat_dim$"): 16,
-    re.compile(r"^hidden_size$"): 16,
-    re.compile(r"^dim$"): 16,
-    re.compile(r"^.*embed_dim$"): 16,
-    re.compile(r"^.*embed_.*dim$"): 16,
-    re.compile(r"^text_dim$"): 16,
-    re.compile(r"^time_embed_dim$"): 4,
-    re.compile(r"^ffn_dim$"): 32,
-    re.compile(r"^intermediate_size$"): 32,
-    re.compile(r"^sample_size$"): 32,
-}
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="Create a tiny version of a diffusers model.")
-    parser.add_argument("--model_repo_id", type=str, required=True, help="HuggingFace repo ID of the source model.")
-    parser.add_argument(
-        "--output_repo_id",
-        type=str,
-        required=True,
-        help="HuggingFace repo ID or local path to save the tiny model to.",
-    )
-    parser.add_argument("--subfolder", type=str, default=None, help="Subfolder within the model repo.")
-    parser.add_argument("--num_layers", type=int, default=2, help="Number of layers to use for the tiny model.")
-    parser.add_argument(
-        "--shrink_dims",
-        action="store_true",
-        help="Also reduce dimension parameters (attention heads, hidden size, embedding dims, etc.).",
-    )
-    parser.add_argument("--push_to_hub", action="store_true", help="Push the tiny model to the HuggingFace Hub.")
-    parser.add_argument(
-        "--token", type=str, default=None, help="HuggingFace token. Defaults to $HF_TOKEN env var if not provided."
-    )
-
-    launch_group = parser.add_argument_group("HF Jobs launch options")
-    launch_group.add_argument("--launch", action="store_true", help="Submit as an HF Job instead of running locally.")
-    launch_group.add_argument("--flavor", type=str, default="cpu-basic", help="HF Jobs hardware flavor.")
-    launch_group.add_argument("--timeout", type=str, default="30m", help="HF Jobs timeout.")
-
-    args = parser.parse_args()
-    if args.token is None:
-        args.token = os.environ.get("HF_TOKEN")
-    return args
-
-
-def launch_job(args):
-    from huggingface_hub import run_uv_job
-
-    script_args = [
-        "--model_repo_id",
-        args.model_repo_id,
-        "--output_repo_id",
-        args.output_repo_id,
-        "--num_layers",
-        str(args.num_layers),
-    ]
-    if args.subfolder:
-        script_args.extend(["--subfolder", args.subfolder])
-    if args.shrink_dims:
-        script_args.append("--shrink_dims")
-    if args.push_to_hub:
-        script_args.append("--push_to_hub")
-
-    job = run_uv_job(
-        __file__,
-        script_args=script_args,
-        flavor=args.flavor,
-        timeout=args.timeout,
-        secrets={"HF_TOKEN": args.token} if args.token else {},
-    )
-    print(f"Job submitted: {job.url}")
-    print(f"Job ID: {job.id}")
-    return job
-
-
-def make_tiny_model(
-    model_repo_id, output_repo_id, subfolder=None, num_layers=2, shrink_dims=False, push_to_hub=False, token=None
-):
-    from diffusers import AutoModel
-
-    config_kwargs = {}
-    if token:
-        config_kwargs["token"] = token
-
-    config = AutoModel.load_config(model_repo_id, subfolder=subfolder, **config_kwargs)
-
-    modified_keys = {}
-    for key, value in config.items():
-        if LAYER_PARAM_PATTERN.match(key) and isinstance(value, int) and value > num_layers:
-            modified_keys[key] = (value, num_layers)
-            config[key] = num_layers
-
-    if shrink_dims:
-        for key, value in config.items():
-            if not isinstance(value, int) or key.startswith("_"):
-                continue
-            for pattern, tiny_value in DIM_PARAM_PATTERNS.items():
-                if pattern.match(key) and value > tiny_value:
-                    modified_keys[key] = (value, tiny_value)
-                    config[key] = tiny_value
-                    break
-
-    if not modified_keys:
-        print("WARNING: No config parameters were modified.")
-        print(f"Config keys: {[k for k in config if not k.startswith('_')]}")
-        return
-
-    print("Modified config parameters:")
-    for key, (old, new) in modified_keys.items():
-        print(f"  {key}: {old} -> {new}")
-
-    model = AutoModel.from_config(config)
-    total_params = sum(p.numel() for p in model.parameters())
-    print(f"Tiny model created with {total_params:,} parameters.")
-
-    save_kwargs = {}
-    if token:
-        save_kwargs["token"] = token
-    if push_to_hub:
-        save_kwargs["repo_id"] = output_repo_id
-    model.save_pretrained(output_repo_id, push_to_hub=push_to_hub, **save_kwargs)
-    if push_to_hub:
-        print(f"Model pushed to https://huggingface.co/{output_repo_id}")
-    else:
-        print(f"Model saved to {output_repo_id}")
-
-
-def main():
-    args = parse_args()
-
-    if args.launch:
-        launch_job(args)
-    else:
-        make_tiny_model(
-            model_repo_id=args.model_repo_id,
-            output_repo_id=args.output_repo_id,
-            subfolder=args.subfolder,
-            num_layers=args.num_layers,
-            shrink_dims=args.shrink_dims,
-            push_to_hub=args.push_to_hub,
-            token=args.token,
-        )
-
-
-if __name__ == "__main__":
-    main()