Compare commits


5 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| Sayak Paul | c61e455ce7 | Merge branch 'main' into device-map-direct | 2025-12-23 13:16:10 +05:30 |
| Miguel Martin | 973a077c6a | Cosmos Predict2.5 14b Conversion (#12863) | 2025-12-22 08:02:06 -10:00 |
| Alvaro Bartolome | 0c4f6c9cff | Add OvisImagePipeline in AUTO_TEXT2IMAGE_PIPELINES_MAPPING (#12876) | 2025-12-22 07:14:03 -10:00 |
| Sayak Paul | 6f5eb0a933 | Merge branch 'main' into device-map-direct | 2025-12-11 14:47:09 +08:00 |
| sayakpaul | 83ec2fb793 | support device type device_maps to work with offloading. | 2025-12-09 11:10:41 +05:30 |
4 changed files with 81 additions and 7 deletions

scripts/convert_cosmos_to_diffusers.py

@@ -29,13 +29,52 @@ hf download nvidia/Cosmos-Predict2.5-2B
 Convert checkpoint
 ```bash
+# pre-trained
 transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-2B/snapshots/865baf084d4c9e850eac59a021277d5a9b9e8b63/base/pre-trained/d20b7120-df3e-4911-919d-db6e08bad31c_ema_bf16.pt
 python scripts/convert_cosmos_to_diffusers.py \
   --transformer_type Cosmos-2.5-Predict-Base-2B \
   --transformer_ckpt_path $transformer_ckpt_path \
   --vae_type wan2.1 \
-  --output_path converted/cosmos-p2.5-base-2b \
+  --output_path converted/2b/d20b7120-df3e-4911-919d-db6e08bad31c \
+  --save_pipeline
+
+# post-trained
+transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-2B/snapshots/865baf084d4c9e850eac59a021277d5a9b9e8b63/base/post-trained/81edfebe-bd6a-4039-8c1d-737df1a790bf_ema_bf16.pt
+python scripts/convert_cosmos_to_diffusers.py \
+  --transformer_type Cosmos-2.5-Predict-Base-2B \
+  --transformer_ckpt_path $transformer_ckpt_path \
+  --vae_type wan2.1 \
+  --output_path converted/2b/81edfebe-bd6a-4039-8c1d-737df1a790bf \
+  --save_pipeline
+```
+
+## 14B
+
+```bash
+hf download nvidia/Cosmos-Predict2.5-14B
+```
+
+```bash
+# pre-trained
+transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-14B/snapshots/71ebf3e8af30ecfe440bf0481115975fcc052b46/base/pre-trained/54937b8c-29de-4f04-862c-e67b04ec41e8_ema_bf16.pt
+python scripts/convert_cosmos_to_diffusers.py \
+  --transformer_type Cosmos-2.5-Predict-Base-14B \
+  --transformer_ckpt_path $transformer_ckpt_path \
+  --vae_type wan2.1 \
+  --output_path converted/14b/54937b8c-29de-4f04-862c-e67b04ec41e8/ \
+  --save_pipeline
+
+# post-trained
+transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-14B/snapshots/71ebf3e8af30ecfe440bf0481115975fcc052b46/base/post-trained/e21d2a49-4747-44c8-ba44-9f6f9243715f_ema_bf16.pt
+python scripts/convert_cosmos_to_diffusers.py \
+  --transformer_type Cosmos-2.5-Predict-Base-14B \
+  --transformer_ckpt_path $transformer_ckpt_path \
+  --vae_type wan2.1 \
+  --output_path converted/14b/e21d2a49-4747-44c8-ba44-9f6f9243715f/ \
   --save_pipeline
 ```
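When `--save_pipeline` is passed, the output directory should load like any local diffusers checkpoint. A minimal sketch, assuming the 2B pre-trained output path from the commands above:

```python
import torch
from diffusers import DiffusionPipeline

# Load the pipeline written by the conversion script's --save_pipeline flag.
pipe = DiffusionPipeline.from_pretrained(
    "converted/2b/d20b7120-df3e-4911-919d-db6e08bad31c",
    torch_dtype=torch.bfloat16,
).to("cuda")
```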
@@ -298,6 +337,25 @@ TRANSFORMER_CONFIGS = {
         "crossattn_proj_in_channels": 100352,
         "encoder_hidden_states_channels": 1024,
     },
+    "Cosmos-2.5-Predict-Base-14B": {
+        "in_channels": 16 + 1,
+        "out_channels": 16,
+        "num_attention_heads": 40,
+        "attention_head_dim": 128,
+        "num_layers": 36,
+        "mlp_ratio": 4.0,
+        "text_embed_dim": 1024,
+        "adaln_lora_dim": 256,
+        "max_size": (128, 240, 240),
+        "patch_size": (1, 2, 2),
+        "rope_scale": (1.0, 3.0, 3.0),
+        "concat_padding_mask": True,
+        # NOTE: source config has pos_emb_learnable: 'True' - but params are missing
+        "extra_pos_embed_type": None,
+        "use_crossattn_projection": True,
+        "crossattn_proj_in_channels": 100352,
+        "encoder_hidden_states_channels": 1024,
+    },
 }
 
 VAE_KEYS_RENAME_DICT = {
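Inside the conversion script, the selected entry is presumably splatted into the transformer constructor. A sketch, assuming a one-to-one mapping between these config keys and the kwargs of `CosmosTransformer3DModel` (the two `crossattn_proj_*` keys are additions in this PR, so their acceptance by the constructor is an assumption):

```python
from diffusers import CosmosTransformer3DModel

# Sketch of what scripts/convert_cosmos_to_diffusers.py would do with the
# 14B entry: look it up by --transformer_type and build the model from it.
config = TRANSFORMER_CONFIGS["Cosmos-2.5-Predict-Base-14B"]
transformer = CosmosTransformer3DModel(**config)
```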

src/diffusers/pipelines/auto_pipeline.py

@@ -73,6 +73,7 @@ from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
 from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
 from .lumina import LuminaPipeline
 from .lumina2 import Lumina2Pipeline
+from .ovis_image import OvisImagePipeline
 from .pag import (
     HunyuanDiTPAGPipeline,
     PixArtSigmaPAGPipeline,
@@ -164,6 +165,7 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
         ("qwenimage", QwenImagePipeline),
         ("qwenimage-controlnet", QwenImageControlNetPipeline),
         ("z-image", ZImagePipeline),
+        ("ovis", OvisImagePipeline),
     ]
 )
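With the mapping entry in place, `AutoPipelineForText2Image.from_pretrained` can resolve checkpoints saved with `OvisImagePipeline`. A minimal sketch (the Hub id below is a placeholder, not a confirmed repository):

```python
import torch
from diffusers import AutoPipelineForText2Image

# Placeholder repo id: substitute a real Ovis-Image checkpoint.
pipe = AutoPipelineForText2Image.from_pretrained(
    "org/ovis-image-checkpoint", torch_dtype=torch.bfloat16
).to("cuda")
image = pipe("a watercolor fox in a forest").images[0]
```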

src/diffusers/pipelines/cosmos/ (Cosmos Predict2.5 pipeline docstring)

@@ -133,7 +133,7 @@ EXAMPLE_DOC_STRING = """
         ...     num_frames=93,
         ...     generator=torch.Generator().manual_seed(1),
         ... ).frames[0]
-        >>> # export_to_video(video, "image2world.mp4", fps=16)
+        >>> export_to_video(video, "image2world.mp4", fps=16)
 
         >>> # Video2World: condition on an input clip and predict a 93-frame world video.
         >>> prompt = (
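This change uncomments the export step so the docstring example actually writes its output. `export_to_video` lives in diffusers' utils; as a standalone snippet (assuming `video` is the list of frames returned by the pipeline call above):

```python
from diffusers.utils import export_to_video

# Write the generated frames to an MP4 at 16 frames per second.
export_to_video(video, "image2world.mp4", fps=16)
```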

src/diffusers/pipelines/pipeline_utils.py

@@ -109,7 +109,7 @@ LIBRARIES = []
 for library in LOADABLE_CLASSES:
     LIBRARIES.append(library)
 
-SUPPORTED_DEVICE_MAP = ["balanced"] + [get_device()]
+SUPPORTED_DEVICE_MAP = ["balanced"] + [get_device(), "cpu"]
 
 logger = logging.get_logger(__name__)
@@ -462,8 +462,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
         pipeline_is_sequentially_offloaded = any(
             module_is_sequentially_offloaded(module) for _, module in self.components.items()
         )
-        is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
+        is_pipeline_device_mapped = self._is_pipeline_device_mapped()
         if is_pipeline_device_mapped:
             raise ValueError(
                 "It seems like you have activated a device mapping strategy on the pipeline which doesn't allow explicit device placement using `to()`. You can call `reset_device_map()` to remove the existing device map from the pipeline."
@@ -1164,7 +1163,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
         """
         self._maybe_raise_error_if_group_offload_active(raise_error=True)
 
-        is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
+        is_pipeline_device_mapped = self._is_pipeline_device_mapped()
         if is_pipeline_device_mapped:
             raise ValueError(
                 "It seems like you have activated a device mapping strategy on the pipeline so calling `enable_model_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_model_cpu_offload()`."
@@ -1286,7 +1285,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
             raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
 
         self.remove_all_hooks()
 
-        is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
+        is_pipeline_device_mapped = self._is_pipeline_device_mapped()
         if is_pipeline_device_mapped:
             raise ValueError(
                 "It seems like you have activated a device mapping strategy on the pipeline so calling `enable_sequential_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_sequential_cpu_offload()`."
@@ -2171,6 +2170,21 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
                 return True
         return False
 
+    def _is_pipeline_device_mapped(self):
+        # We support passing `device_map="cuda"`, for example. This is helpful, in case
+        # users want to pass `device_map="cpu"` when initializing a pipeline. This explicit declaration is desirable
+        # in limited VRAM environments because quantized models often initialize directly on the accelerator.
+        device_map = self.hf_device_map
+        is_device_type_map = False
+        if isinstance(device_map, str):
+            try:
+                torch.device(device_map)
+                is_device_type_map = True
+            except RuntimeError:
+                pass
+        return not is_device_type_map and isinstance(device_map, dict) and len(device_map) > 1
+
 
 class StableDiffusionMixin:
     r"""