Compare commits


1 Commit

Author      SHA1         Message   Date
sayakpaul   d957cd816d   up        2025-12-22 18:39:27 +05:30
8 changed files with 47 additions and 105 deletions

View File

@@ -21,8 +21,8 @@ from transformers import (
     BertModel,
     BertTokenizer,
     CLIPImageProcessor,
+    MT5Tokenizer,
     T5EncoderModel,
-    T5Tokenizer,
 )
 from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
@@ -260,7 +260,7 @@ class HunyuanDiTDifferentialImg2ImgPipeline(DiffusionPipeline):
             The HunyuanDiT model designed by Tencent Hunyuan.
         text_encoder_2 (`T5EncoderModel`):
             The mT5 embedder. Specifically, it is 't5-v1_1-xxl'.
-        tokenizer_2 (`T5Tokenizer`):
+        tokenizer_2 (`MT5Tokenizer`):
             The tokenizer for the mT5 embedder.
         scheduler ([`DDPMScheduler`]):
             A scheduler to be used in combination with HunyuanDiT to denoise the encoded image latents.
@@ -295,7 +295,7 @@ class HunyuanDiTDifferentialImg2ImgPipeline(DiffusionPipeline):
         feature_extractor: CLIPImageProcessor,
         requires_safety_checker: bool = True,
         text_encoder_2=T5EncoderModel,
-        tokenizer_2=T5Tokenizer,
+        tokenizer_2=MT5Tokenizer,
     ):
         super().__init__()
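The remaining HunyuanDiT pipeline diffs below repeat this same `T5Tokenizer` to `MT5Tokenizer` swap. As a rough usage sketch (not part of this commit; the repo id and subfolder names are assumptions based on the usual Diffusers-format HunyuanDiT checkpoint layout), the secondary text encoder components would now be constructed like this:

```python
from transformers import MT5Tokenizer, T5EncoderModel

repo_id = "Tencent-Hunyuan/HunyuanDiT-Diffusers"  # assumed checkpoint, for illustration only

# The mT5 embedder still uses T5EncoderModel; only the tokenizer class changes.
tokenizer_2 = MT5Tokenizer.from_pretrained(repo_id, subfolder="tokenizer_2")
text_encoder_2 = T5EncoderModel.from_pretrained(repo_id, subfolder="text_encoder_2")
```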

View File

@@ -29,52 +29,13 @@ hf download nvidia/Cosmos-Predict2.5-2B
 Convert checkpoint
 ```bash
-# pre-trained
 transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-2B/snapshots/865baf084d4c9e850eac59a021277d5a9b9e8b63/base/pre-trained/d20b7120-df3e-4911-919d-db6e08bad31c_ema_bf16.pt
 python scripts/convert_cosmos_to_diffusers.py \
     --transformer_type Cosmos-2.5-Predict-Base-2B \
     --transformer_ckpt_path $transformer_ckpt_path \
     --vae_type wan2.1 \
-    --output_path converted/2b/d20b7120-df3e-4911-919d-db6e08bad31c \
+    --output_path converted/cosmos-p2.5-base-2b \
-    --save_pipeline
-# post-trained
-transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-2B/snapshots/865baf084d4c9e850eac59a021277d5a9b9e8b63/base/post-trained/81edfebe-bd6a-4039-8c1d-737df1a790bf_ema_bf16.pt
-python scripts/convert_cosmos_to_diffusers.py \
-    --transformer_type Cosmos-2.5-Predict-Base-2B \
-    --transformer_ckpt_path $transformer_ckpt_path \
-    --vae_type wan2.1 \
-    --output_path converted/2b/81edfebe-bd6a-4039-8c1d-737df1a790bf \
-    --save_pipeline
-```
-## 14B
-```bash
-hf download nvidia/Cosmos-Predict2.5-14B
-```
-```bash
-# pre-trained
-transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-14B/snapshots/71ebf3e8af30ecfe440bf0481115975fcc052b46/base/pre-trained/54937b8c-29de-4f04-862c-e67b04ec41e8_ema_bf16.pt
-python scripts/convert_cosmos_to_diffusers.py \
-    --transformer_type Cosmos-2.5-Predict-Base-14B \
-    --transformer_ckpt_path $transformer_ckpt_path \
-    --vae_type wan2.1 \
-    --output_path converted/14b/54937b8c-29de-4f04-862c-e67b04ec41e8/ \
-    --save_pipeline
-# post-trained
-transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-14B/snapshots/71ebf3e8af30ecfe440bf0481115975fcc052b46/base/post-trained/e21d2a49-4747-44c8-ba44-9f6f9243715f_ema_bf16.pt
-python scripts/convert_cosmos_to_diffusers.py \
-    --transformer_type Cosmos-2.5-Predict-Base-14B \
-    --transformer_ckpt_path $transformer_ckpt_path \
-    --vae_type wan2.1 \
-    --output_path converted/14b/e21d2a49-4747-44c8-ba44-9f6f9243715f/ \
     --save_pipeline
 ```
@@ -337,25 +298,6 @@ TRANSFORMER_CONFIGS = {
         "crossattn_proj_in_channels": 100352,
         "encoder_hidden_states_channels": 1024,
     },
-    "Cosmos-2.5-Predict-Base-14B": {
-        "in_channels": 16 + 1,
-        "out_channels": 16,
-        "num_attention_heads": 40,
-        "attention_head_dim": 128,
-        "num_layers": 36,
-        "mlp_ratio": 4.0,
-        "text_embed_dim": 1024,
-        "adaln_lora_dim": 256,
-        "max_size": (128, 240, 240),
-        "patch_size": (1, 2, 2),
-        "rope_scale": (1.0, 3.0, 3.0),
-        "concat_padding_mask": True,
-        # NOTE: source config has pos_emb_learnable: 'True' - but params are missing
-        "extra_pos_embed_type": None,
-        "use_crossattn_projection": True,
-        "crossattn_proj_in_channels": 100352,
-        "encoder_hidden_states_channels": 1024,
-    },
 }
 VAE_KEYS_RENAME_DICT = {
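Because the conversion command passes `--save_pipeline`, the `--output_path` directory ends up holding a complete Diffusers pipeline folder. A minimal loading sketch (not part of this commit; dtype and device are assumptions):

```python
import torch
from diffusers import DiffusionPipeline

# DiffusionPipeline resolves the concrete pipeline class from the folder's model_index.json.
pipe = DiffusionPipeline.from_pretrained(
    "converted/cosmos-p2.5-base-2b",  # path written by the conversion command above
    torch_dtype=torch.bfloat16,
)
pipe.to("cuda")
```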

View File

@@ -73,7 +73,6 @@ from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
 from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
 from .lumina import LuminaPipeline
 from .lumina2 import Lumina2Pipeline
-from .ovis_image import OvisImagePipeline
 from .pag import (
     HunyuanDiTPAGPipeline,
     PixArtSigmaPAGPipeline,
@@ -165,7 +164,6 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
         ("qwenimage", QwenImagePipeline),
         ("qwenimage-controlnet", QwenImageControlNetPipeline),
         ("z-image", ZImagePipeline),
-        ("ovis", OvisImagePipeline),
     ]
 )

View File

@@ -17,7 +17,7 @@ from typing import Callable, Dict, List, Optional, Tuple, Union
 import numpy as np
 import torch
-from transformers import BertModel, BertTokenizer, CLIPImageProcessor, T5EncoderModel, T5Tokenizer
+from transformers import BertModel, BertTokenizer, CLIPImageProcessor, MT5Tokenizer, T5EncoderModel
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
@@ -185,7 +185,7 @@ class HunyuanDiTControlNetPipeline(DiffusionPipeline):
             The HunyuanDiT model designed by Tencent Hunyuan.
         text_encoder_2 (`T5EncoderModel`):
             The mT5 embedder. Specifically, it is 't5-v1_1-xxl'.
-        tokenizer_2 (`T5Tokenizer`):
+        tokenizer_2 (`MT5Tokenizer`):
             The tokenizer for the mT5 embedder.
         scheduler ([`DDPMScheduler`]):
             A scheduler to be used in combination with HunyuanDiT to denoise the encoded image latents.
@@ -229,7 +229,7 @@ class HunyuanDiTControlNetPipeline(DiffusionPipeline):
             HunyuanDiT2DMultiControlNetModel,
         ],
         text_encoder_2: Optional[T5EncoderModel] = None,
-        tokenizer_2: Optional[T5Tokenizer] = None,
+        tokenizer_2: Optional[MT5Tokenizer] = None,
         requires_safety_checker: bool = True,
     ):
         super().__init__()

View File

@@ -133,7 +133,7 @@ EXAMPLE_DOC_STRING = """
        ...     num_frames=93,
        ...     generator=torch.Generator().manual_seed(1),
        ... ).frames[0]
-        >>> export_to_video(video, "image2world.mp4", fps=16)
+        >>> # export_to_video(video, "image2world.mp4", fps=16)
         >>> # Video2World: condition on an input clip and predict a 93-frame world video.
         >>> prompt = (
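The example now leaves the export step commented out; if you do want to write the frames to disk, the helper it references is available from `diffusers.utils` (a minimal sketch, assuming `video` is the list of frames returned by the pipeline call above):

```python
from diffusers.utils import export_to_video

# Write the generated frames to an MP4 at 16 frames per second.
export_to_video(video, "image2world.mp4", fps=16)
```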

View File

@@ -17,7 +17,7 @@ from typing import Callable, Dict, List, Optional, Tuple, Union
 import numpy as np
 import torch
-from transformers import BertModel, BertTokenizer, CLIPImageProcessor, T5EncoderModel, T5Tokenizer
+from transformers import BertModel, BertTokenizer, CLIPImageProcessor, MT5Tokenizer, T5EncoderModel
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
@@ -169,7 +169,7 @@ class HunyuanDiTPipeline(DiffusionPipeline):
             The HunyuanDiT model designed by Tencent Hunyuan.
         text_encoder_2 (`T5EncoderModel`):
             The mT5 embedder. Specifically, it is 't5-v1_1-xxl'.
-        tokenizer_2 (`T5Tokenizer`):
+        tokenizer_2 (`MT5Tokenizer`):
             The tokenizer for the mT5 embedder.
         scheduler ([`DDPMScheduler`]):
             A scheduler to be used in combination with HunyuanDiT to denoise the encoded image latents.
@@ -204,7 +204,7 @@ class HunyuanDiTPipeline(DiffusionPipeline):
         feature_extractor: CLIPImageProcessor,
         requires_safety_checker: bool = True,
         text_encoder_2: Optional[T5EncoderModel] = None,
-        tokenizer_2: Optional[T5Tokenizer] = None,
+        tokenizer_2: Optional[MT5Tokenizer] = None,
     ):
         super().__init__()

View File

@@ -17,7 +17,7 @@ from typing import Callable, Dict, List, Optional, Tuple, Union
 import numpy as np
 import torch
-from transformers import BertModel, BertTokenizer, CLIPImageProcessor, T5EncoderModel, T5Tokenizer
+from transformers import BertModel, BertTokenizer, CLIPImageProcessor, MT5Tokenizer, T5EncoderModel
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
@@ -173,7 +173,7 @@ class HunyuanDiTPAGPipeline(DiffusionPipeline, PAGMixin):
             The HunyuanDiT model designed by Tencent Hunyuan.
         text_encoder_2 (`T5EncoderModel`):
             The mT5 embedder. Specifically, it is 't5-v1_1-xxl'.
-        tokenizer_2 (`T5Tokenizer`):
+        tokenizer_2 (`MT5Tokenizer`):
             The tokenizer for the mT5 embedder.
         scheduler ([`DDPMScheduler`]):
             A scheduler to be used in combination with HunyuanDiT to denoise the encoded image latents.
@@ -208,7 +208,7 @@ class HunyuanDiTPAGPipeline(DiffusionPipeline, PAGMixin):
         feature_extractor: Optional[CLIPImageProcessor] = None,
         requires_safety_checker: bool = True,
         text_encoder_2: Optional[T5EncoderModel] = None,
-        tokenizer_2: Optional[T5Tokenizer] = None,
+        tokenizer_2: Optional[MT5Tokenizer] = None,
         pag_applied_layers: Union[str, List[str]] = "blocks.1",  # "blocks.16.attn1", "blocks.16", "16", 16
     ):
         super().__init__()

View File

@@ -671,44 +671,46 @@ class TorchAoSerializationTest(unittest.TestCase):
 class TorchAoCompileTest(QuantCompileTests, unittest.TestCase):
     @property
     def quantization_config(self):
+        from torchao.quantization import Int8WeightOnlyConfig
         return PipelineQuantizationConfig(
             quant_mapping={
-                "transformer": TorchAoConfig(quant_type="int8_weight_only"),
+                "transformer": TorchAoConfig(Int8WeightOnlyConfig()),
             },
         )
-    @unittest.skip(
-        "Changing the device of AQT tensor with module._apply (called from doing module.to() in accelerate) does not work "
-        "when compiling."
-    )
-    def test_torch_compile_with_cpu_offload(self):
-        # RuntimeError: _apply(): Couldn't swap Linear.weight
-        super().test_torch_compile_with_cpu_offload()
-    @parameterized.expand([False, True])
-    @unittest.skip(
-        """
-        For `use_stream=False`:
-        - Changing the device of AQT tensor, with `param.data = param.data.to(device)` as done in group offloading implementation
-        is unsupported in TorchAO. When compiling, FakeTensor device mismatch causes failure.
-        For `use_stream=True`:
-        Using non-default stream requires ability to pin tensors. AQT does not seem to support this yet in TorchAO.
-        """
-    )
-    def test_torch_compile_with_group_offload_leaf(self, use_stream):
-        # For use_stream=False:
-        # If we run group offloading without compilation, we will see:
-        # RuntimeError: Attempted to set the storage of a tensor on device "cpu" to a storage on different device "cuda:0". This is no longer allowed; the devices must match.
-        # When running with compilation, the error ends up being different:
-        # Dynamo failed to run FX node with fake tensors: call_function <built-in function linear>(*(FakeTensor(..., device='cuda:0', size=(s0, 256), dtype=torch.bfloat16), AffineQuantizedTensor(tensor_impl=PlainAQTTensorImpl(data=FakeTensor(..., size=(1536, 256), dtype=torch.int8)... , scale=FakeTensor(..., size=(1536,), dtype=torch.bfloat16)... , zero_point=FakeTensor(..., size=(1536,), dtype=torch.int64)... , _layout=PlainLayout()), block_size=(1, 256), shape=torch.Size([1536, 256]), device=cpu, dtype=torch.bfloat16, requires_grad=False), Parameter(FakeTensor(..., device='cuda:0', size=(1536,), dtype=torch.bfloat16,
-        # requires_grad=True))), **{}): got RuntimeError('Unhandled FakeTensor Device Propagation for aten.mm.default, found two different devices cuda:0, cpu')
-        # Looks like something that will have to be looked into upstream.
-        # for linear layers, weight.tensor_impl shows cuda... but:
-        # weight.tensor_impl.{data,scale,zero_point}.device will be cpu
-        # For use_stream=True:
-        # NotImplementedError: AffineQuantizedTensor dispatch: attempting to run unimplemented operator/function: func=<OpOverload(op='aten.is_pinned', overload='default')>, types=(<class 'torchao.dtypes.affine_quantized_tensor.AffineQuantizedTensor'>,), arg_types=(<class 'torchao.dtypes.affine_quantized_tensor.AffineQuantizedTensor'>,), kwarg_types={}
-        super()._test_torch_compile_with_group_offload_leaf(use_stream=use_stream)
+    # @unittest.skip(
+    #     "Changing the device of AQT tensor with module._apply (called from doing module.to() in accelerate) does not work "
+    #     "when compiling."
+    # )
+    # def test_torch_compile_with_cpu_offload(self):
+    #     # RuntimeError: _apply(): Couldn't swap Linear.weight
+    #     super().test_torch_compile_with_cpu_offload()
+    # @parameterized.expand([False, True])
+    # @unittest.skip(
+    #     """
+    #     For `use_stream=False`:
+    #     - Changing the device of AQT tensor, with `param.data = param.data.to(device)` as done in group offloading implementation
+    #     is unsupported in TorchAO. When compiling, FakeTensor device mismatch causes failure.
+    #     For `use_stream=True`:
+    #     Using non-default stream requires ability to pin tensors. AQT does not seem to support this yet in TorchAO.
+    #     """
+    # )
+    # def test_torch_compile_with_group_offload_leaf(self, use_stream):
+    #     # For use_stream=False:
+    #     # If we run group offloading without compilation, we will see:
+    #     # RuntimeError: Attempted to set the storage of a tensor on device "cpu" to a storage on different device "cuda:0". This is no longer allowed; the devices must match.
+    #     # When running with compilation, the error ends up being different:
+    #     # Dynamo failed to run FX node with fake tensors: call_function <built-in function linear>(*(FakeTensor(..., device='cuda:0', size=(s0, 256), dtype=torch.bfloat16), AffineQuantizedTensor(tensor_impl=PlainAQTTensorImpl(data=FakeTensor(..., size=(1536, 256), dtype=torch.int8)... , scale=FakeTensor(..., size=(1536,), dtype=torch.bfloat16)... , zero_point=FakeTensor(..., size=(1536,), dtype=torch.int64)... , _layout=PlainLayout()), block_size=(1, 256), shape=torch.Size([1536, 256]), device=cpu, dtype=torch.bfloat16, requires_grad=False), Parameter(FakeTensor(..., device='cuda:0', size=(1536,), dtype=torch.bfloat16,
+    #     # requires_grad=True))), **{}): got RuntimeError('Unhandled FakeTensor Device Propagation for aten.mm.default, found two different devices cuda:0, cpu')
+    #     # Looks like something that will have to be looked into upstream.
+    #     # for linear layers, weight.tensor_impl shows cuda... but:
+    #     # weight.tensor_impl.{data,scale,zero_point}.device will be cpu
+    #     # For use_stream=True:
+    #     # NotImplementedError: AffineQuantizedTensor dispatch: attempting to run unimplemented operator/function: func=<OpOverload(op='aten.is_pinned', overload='default')>, types=(<class 'torchao.dtypes.affine_quantized_tensor.AffineQuantizedTensor'>,), arg_types=(<class 'torchao.dtypes.affine_quantized_tensor.AffineQuantizedTensor'>,), kwarg_types={}
+    #     super()._test_torch_compile_with_group_offload_leaf(use_stream=use_stream)
     # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners
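For reference, a standalone sketch of the quantization setup the updated test now builds, using the torchao `Int8WeightOnlyConfig` object in place of the `quant_type="int8_weight_only"` string; the model id, dtype, and device below are placeholders and not part of this commit:

```python
import torch
from diffusers import DiffusionPipeline, TorchAoConfig
from diffusers.quantizers import PipelineQuantizationConfig
from torchao.quantization import Int8WeightOnlyConfig

# int8 weight-only quantization for the transformer, expressed with a torchao
# config object rather than the "int8_weight_only" string shortcut.
quant_config = PipelineQuantizationConfig(
    quant_mapping={"transformer": TorchAoConfig(Int8WeightOnlyConfig())},
)

# Placeholder model id, just to show where the config plugs in.
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)
pipe.to("cuda")
```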