Compare commits

1 commit

Author: sayakpaul | SHA1: d957cd816d | Message: up | Date: 2025-12-22 18:39:27 +05:30
4 changed files with 35 additions and 93 deletions

View File

@@ -29,52 +29,13 @@ hf download nvidia/Cosmos-Predict2.5-2B
 Convert checkpoint
 ```bash
-# pre-trained
 transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-2B/snapshots/865baf084d4c9e850eac59a021277d5a9b9e8b63/base/pre-trained/d20b7120-df3e-4911-919d-db6e08bad31c_ema_bf16.pt
 python scripts/convert_cosmos_to_diffusers.py \
     --transformer_type Cosmos-2.5-Predict-Base-2B \
     --transformer_ckpt_path $transformer_ckpt_path \
     --vae_type wan2.1 \
-    --output_path converted/2b/d20b7120-df3e-4911-919d-db6e08bad31c \
-    --save_pipeline
-# post-trained
-transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-2B/snapshots/865baf084d4c9e850eac59a021277d5a9b9e8b63/base/post-trained/81edfebe-bd6a-4039-8c1d-737df1a790bf_ema_bf16.pt
-python scripts/convert_cosmos_to_diffusers.py \
-    --transformer_type Cosmos-2.5-Predict-Base-2B \
-    --transformer_ckpt_path $transformer_ckpt_path \
-    --vae_type wan2.1 \
-    --output_path converted/2b/81edfebe-bd6a-4039-8c1d-737df1a790bf \
-    --save_pipeline
-```
-## 14B
-```bash
-hf download nvidia/Cosmos-Predict2.5-14B
-```
-```bash
-# pre-trained
-transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-14B/snapshots/71ebf3e8af30ecfe440bf0481115975fcc052b46/base/pre-trained/54937b8c-29de-4f04-862c-e67b04ec41e8_ema_bf16.pt
-python scripts/convert_cosmos_to_diffusers.py \
-    --transformer_type Cosmos-2.5-Predict-Base-14B \
-    --transformer_ckpt_path $transformer_ckpt_path \
-    --vae_type wan2.1 \
-    --output_path converted/14b/54937b8c-29de-4f04-862c-e67b04ec41e8/ \
-    --save_pipeline
-# post-trained
-transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-14B/snapshots/71ebf3e8af30ecfe440bf0481115975fcc052b46/base/post-trained/e21d2a49-4747-44c8-ba44-9f6f9243715f_ema_bf16.pt
-python scripts/convert_cosmos_to_diffusers.py \
-    --transformer_type Cosmos-2.5-Predict-Base-14B \
-    --transformer_ckpt_path $transformer_ckpt_path \
-    --vae_type wan2.1 \
-    --output_path converted/14b/e21d2a49-4747-44c8-ba44-9f6f9243715f/ \
+    --output_path converted/cosmos-p2.5-base-2b \
     --save_pipeline
 ```
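
Since the retained command keeps `--save_pipeline`, the folder written to `converted/cosmos-p2.5-base-2b` should be loadable as a regular diffusers pipeline. A minimal sketch: only the output path comes from this diff, the rest is standard `DiffusionPipeline` usage, and the concrete pipeline class is assumed to be resolved from the saved `model_index.json`:

```python
# Minimal sketch: load the folder written by --save_pipeline above.
# DiffusionPipeline is assumed to resolve the concrete pipeline class
# from the model_index.json saved alongside the weights.
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "converted/cosmos-p2.5-base-2b", torch_dtype=torch.bfloat16
)
pipe.to("cuda")
```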
@@ -337,25 +298,6 @@ TRANSFORMER_CONFIGS = {
"crossattn_proj_in_channels": 100352, "crossattn_proj_in_channels": 100352,
"encoder_hidden_states_channels": 1024, "encoder_hidden_states_channels": 1024,
}, },
"Cosmos-2.5-Predict-Base-14B": {
"in_channels": 16 + 1,
"out_channels": 16,
"num_attention_heads": 40,
"attention_head_dim": 128,
"num_layers": 36,
"mlp_ratio": 4.0,
"text_embed_dim": 1024,
"adaln_lora_dim": 256,
"max_size": (128, 240, 240),
"patch_size": (1, 2, 2),
"rope_scale": (1.0, 3.0, 3.0),
"concat_padding_mask": True,
# NOTE: source config has pos_emb_learnable: 'True' - but params are missing
"extra_pos_embed_type": None,
"use_crossattn_projection": True,
"crossattn_proj_in_channels": 100352,
"encoder_hidden_states_channels": 1024,
},
} }
VAE_KEYS_RENAME_DICT = { VAE_KEYS_RENAME_DICT = {
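
For reference, the removed 14B entry states the model width only implicitly. A quick sketch of the derived sizes, assuming the usual hidden_size = num_attention_heads * attention_head_dim convention, with the `16 + 1` input presumably being 16 latent channels plus one extra conditioning channel:

```python
# Sketch: sizes implied by the removed Cosmos-2.5-Predict-Base-14B entry.
# Assumes the usual heads * head_dim convention for the hidden size.
num_attention_heads = 40
attention_head_dim = 128
mlp_ratio = 4.0

hidden_size = num_attention_heads * attention_head_dim
print(hidden_size)                   # 5120
print(int(hidden_size * mlp_ratio))  # 20480, the feed-forward inner width
print(16 + 1)                        # 17 in_channels: 16 latent + 1 (assumed conditioning channel)
```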

View File

@@ -73,7 +73,6 @@ from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
 from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
 from .lumina import LuminaPipeline
 from .lumina2 import Lumina2Pipeline
-from .ovis_image import OvisImagePipeline
 from .pag import (
     HunyuanDiTPAGPipeline,
     PixArtSigmaPAGPipeline,
@@ -165,7 +164,6 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
("qwenimage", QwenImagePipeline), ("qwenimage", QwenImagePipeline),
("qwenimage-controlnet", QwenImageControlNetPipeline), ("qwenimage-controlnet", QwenImageControlNetPipeline),
("z-image", ZImagePipeline), ("z-image", ZImagePipeline),
("ovis", OvisImagePipeline),
] ]
) )
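
Worth noting: this mapping backs the `AutoPipelineForText2Image` factory, so dropping the `("ovis", OvisImagePipeline)` entry means the auto class can no longer resolve that checkpoint type. A rough sketch of how the mapping is consumed (the model id is a placeholder, not a real repo):

```python
# Sketch: AUTO_TEXT2IMAGE_PIPELINES_MAPPING is what AutoPipelineForText2Image
# consults when resolving a checkpoint to a concrete pipeline class.
# "some-org/z-image-checkpoint" is a placeholder id.
from diffusers import AutoPipelineForText2Image

pipe = AutoPipelineForText2Image.from_pretrained("some-org/z-image-checkpoint")
print(type(pipe).__name__)  # e.g. ZImagePipeline, via the "z-image" entry above
```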

View File

@@ -133,7 +133,7 @@ EXAMPLE_DOC_STRING = """
 ...     num_frames=93,
 ...     generator=torch.Generator().manual_seed(1),
 ... ).frames[0]
->>> export_to_video(video, "image2world.mp4", fps=16)
+>>> # export_to_video(video, "image2world.mp4", fps=16)
 
 >>> # Video2World: condition on an input clip and predict a 93-frame world video.
 >>> prompt = (
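
Since the docstring now comments out the export step, readers who do want the file on disk can still call the exporter directly; a one-line sketch, assuming `video` is the frames list returned by the pipeline call above:

```python
# Sketch: persist the generated frames. `video` is assumed to hold the
# list of PIL frames returned by the pipeline call in the docstring above.
from diffusers.utils import export_to_video

export_to_video(video, "image2world.mp4", fps=16)
```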

View File

@@ -671,44 +671,46 @@ class TorchAoSerializationTest(unittest.TestCase):
 class TorchAoCompileTest(QuantCompileTests, unittest.TestCase):
     @property
     def quantization_config(self):
+        from torchao.quantization import Int8WeightOnlyConfig
+
         return PipelineQuantizationConfig(
             quant_mapping={
-                "transformer": TorchAoConfig(quant_type="int8_weight_only"),
+                "transformer": TorchAoConfig(Int8WeightOnlyConfig()),
             },
         )
 
-    @unittest.skip(
-        "Changing the device of AQT tensor with module._apply (called from doing module.to() in accelerate) does not work "
-        "when compiling."
-    )
-    def test_torch_compile_with_cpu_offload(self):
-        # RuntimeError: _apply(): Couldn't swap Linear.weight
-        super().test_torch_compile_with_cpu_offload()
+    # @unittest.skip(
+    #     "Changing the device of AQT tensor with module._apply (called from doing module.to() in accelerate) does not work "
+    #     "when compiling."
+    # )
+    # def test_torch_compile_with_cpu_offload(self):
+    #     # RuntimeError: _apply(): Couldn't swap Linear.weight
+    #     super().test_torch_compile_with_cpu_offload()
 
-    @parameterized.expand([False, True])
-    @unittest.skip(
-        """
-        For `use_stream=False`:
-        - Changing the device of AQT tensor, with `param.data = param.data.to(device)` as done in group offloading implementation
-        is unsupported in TorchAO. When compiling, FakeTensor device mismatch causes failure.
-        For `use_stream=True`:
-        Using non-default stream requires ability to pin tensors. AQT does not seem to support this yet in TorchAO.
-        """
-    )
-    def test_torch_compile_with_group_offload_leaf(self, use_stream):
-        # For use_stream=False:
-        # If we run group offloading without compilation, we will see:
-        # RuntimeError: Attempted to set the storage of a tensor on device "cpu" to a storage on different device "cuda:0". This is no longer allowed; the devices must match.
-        # When running with compilation, the error ends up being different:
-        # Dynamo failed to run FX node with fake tensors: call_function <built-in function linear>(*(FakeTensor(..., device='cuda:0', size=(s0, 256), dtype=torch.bfloat16), AffineQuantizedTensor(tensor_impl=PlainAQTTensorImpl(data=FakeTensor(..., size=(1536, 256), dtype=torch.int8)... , scale=FakeTensor(..., size=(1536,), dtype=torch.bfloat16)... , zero_point=FakeTensor(..., size=(1536,), dtype=torch.int64)... , _layout=PlainLayout()), block_size=(1, 256), shape=torch.Size([1536, 256]), device=cpu, dtype=torch.bfloat16, requires_grad=False), Parameter(FakeTensor(..., device='cuda:0', size=(1536,), dtype=torch.bfloat16,
-        # requires_grad=True))), **{}): got RuntimeError('Unhandled FakeTensor Device Propagation for aten.mm.default, found two different devices cuda:0, cpu')
-        # Looks like something that will have to be looked into upstream.
-        # for linear layers, weight.tensor_impl shows cuda... but:
-        # weight.tensor_impl.{data,scale,zero_point}.device will be cpu
-        # For use_stream=True:
-        # NotImplementedError: AffineQuantizedTensor dispatch: attempting to run unimplemented operator/function: func=<OpOverload(op='aten.is_pinned', overload='default')>, types=(<class 'torchao.dtypes.affine_quantized_tensor.AffineQuantizedTensor'>,), arg_types=(<class 'torchao.dtypes.affine_quantized_tensor.AffineQuantizedTensor'>,), kwarg_types={}
-        super()._test_torch_compile_with_group_offload_leaf(use_stream=use_stream)
+    # @parameterized.expand([False, True])
+    # @unittest.skip(
+    #     """
+    #     For `use_stream=False`:
+    #     - Changing the device of AQT tensor, with `param.data = param.data.to(device)` as done in group offloading implementation
+    #     is unsupported in TorchAO. When compiling, FakeTensor device mismatch causes failure.
+    #     For `use_stream=True`:
+    #     Using non-default stream requires ability to pin tensors. AQT does not seem to support this yet in TorchAO.
+    #     """
+    # )
+    # def test_torch_compile_with_group_offload_leaf(self, use_stream):
+    #     # For use_stream=False:
+    #     # If we run group offloading without compilation, we will see:
+    #     # RuntimeError: Attempted to set the storage of a tensor on device "cpu" to a storage on different device "cuda:0". This is no longer allowed; the devices must match.
+    #     # When running with compilation, the error ends up being different:
+    #     # Dynamo failed to run FX node with fake tensors: call_function <built-in function linear>(*(FakeTensor(..., device='cuda:0', size=(s0, 256), dtype=torch.bfloat16), AffineQuantizedTensor(tensor_impl=PlainAQTTensorImpl(data=FakeTensor(..., size=(1536, 256), dtype=torch.int8)... , scale=FakeTensor(..., size=(1536,), dtype=torch.bfloat16)... , zero_point=FakeTensor(..., size=(1536,), dtype=torch.int64)... , _layout=PlainLayout()), block_size=(1, 256), shape=torch.Size([1536, 256]), device=cpu, dtype=torch.bfloat16, requires_grad=False), Parameter(FakeTensor(..., device='cuda:0', size=(1536,), dtype=torch.bfloat16,
+    #     # requires_grad=True))), **{}): got RuntimeError('Unhandled FakeTensor Device Propagation for aten.mm.default, found two different devices cuda:0, cpu')
+    #     # Looks like something that will have to be looked into upstream.
+    #     # for linear layers, weight.tensor_impl shows cuda... but:
+    #     # weight.tensor_impl.{data,scale,zero_point}.device will be cpu
+    #     # For use_stream=True:
+    #     # NotImplementedError: AffineQuantizedTensor dispatch: attempting to run unimplemented operator/function: func=<OpOverload(op='aten.is_pinned', overload='default')>, types=(<class 'torchao.dtypes.affine_quantized_tensor.AffineQuantizedTensor'>,), arg_types=(<class 'torchao.dtypes.affine_quantized_tensor.AffineQuantizedTensor'>,), kwarg_types={}
+    #     super()._test_torch_compile_with_group_offload_leaf(use_stream=use_stream)
 
     # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners
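
The switch from the `"int8_weight_only"` string alias to a torchao config object can be exercised outside the test suite, too. A minimal sketch, assuming a diffusers build whose `TorchAoConfig` accepts a torchao `AOBaseConfig` instance (as this diff relies on); the model id is a placeholder:

```python
# Sketch: both spellings of int8 weight-only quantization for a pipeline's
# transformer. Assumes TorchAoConfig accepts a torchao AOBaseConfig instance;
# "some-org/some-model" is a placeholder checkpoint id.
import torch
from diffusers import DiffusionPipeline, TorchAoConfig
from diffusers.quantizers import PipelineQuantizationConfig
from torchao.quantization import Int8WeightOnlyConfig

quant_config = PipelineQuantizationConfig(
    quant_mapping={
        # older string-alias form:
        # "transformer": TorchAoConfig(quant_type="int8_weight_only"),
        # explicit torchao config object, as used in the test above:
        "transformer": TorchAoConfig(Int8WeightOnlyConfig()),
    },
)
pipe = DiffusionPipeline.from_pretrained(
    "some-org/some-model",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)
```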