Merge branch 'main' into device-map-direct

Cosmos Predict2.5 14b Conversion (#12863 )
14b conversion
2025-12-23 21:04:56 +08:00 · 2025-12-23 13:16:10 +05:30 · 2025-12-22 08:02:06 -10:00 · 2025-12-22 07:14:03 -10:00 · 2025-12-11 14:47:09 +08:00 · 2025-12-09 11:10:41 +05:30
4 changed files with 81 additions and 7 deletions
--- a/scripts/convert_cosmos_to_diffusers.py
+++ b/scripts/convert_cosmos_to_diffusers.py
@@ -29,13 +29,52 @@ hf download nvidia/Cosmos-Predict2.5-2B
 Convert checkpoint
 ```bash
 # pre-trained
 transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-2B/snapshots/865baf084d4c9e850eac59a021277d5a9b9e8b63/base/pre-trained/d20b7120-df3e-4911-919d-db6e08bad31c_ema_bf16.pt
 python scripts/convert_cosmos_to_diffusers.py \
    --transformer_type Cosmos-2.5-Predict-Base-2B \
    --transformer_ckpt_path $transformer_ckpt_path \
    --vae_type wan2.1 \
-    --output_path converted/cosmos-p2.5-base-2b \
+    --output_path converted/2b/d20b7120-df3e-4911-919d-db6e08bad31c \
    --save_pipeline
 # post-trained
 transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-2B/snapshots/865baf084d4c9e850eac59a021277d5a9b9e8b63/base/post-trained/81edfebe-bd6a-4039-8c1d-737df1a790bf_ema_bf16.pt
 python scripts/convert_cosmos_to_diffusers.py \
    --transformer_type Cosmos-2.5-Predict-Base-2B \
    --transformer_ckpt_path $transformer_ckpt_path \
    --vae_type wan2.1 \
    --output_path converted/2b/81edfebe-bd6a-4039-8c1d-737df1a790bf \
    --save_pipeline
 ```
 ## 14B
 ```bash
 hf download nvidia/Cosmos-Predict2.5-14B
 ```
 ```bash
 # pre-trained
 transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-14B/snapshots/71ebf3e8af30ecfe440bf0481115975fcc052b46/base/pre-trained/54937b8c-29de-4f04-862c-e67b04ec41e8_ema_bf16.pt
 python scripts/convert_cosmos_to_diffusers.py \
    --transformer_type Cosmos-2.5-Predict-Base-14B \
    --transformer_ckpt_path $transformer_ckpt_path \
    --vae_type wan2.1 \
    --output_path converted/14b/54937b8c-29de-4f04-862c-e67b04ec41e8/ \
    --save_pipeline
 # post-trained
 transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-14B/snapshots/71ebf3e8af30ecfe440bf0481115975fcc052b46/base/post-trained/e21d2a49-4747-44c8-ba44-9f6f9243715f_ema_bf16.pt
 python scripts/convert_cosmos_to_diffusers.py \
    --transformer_type Cosmos-2.5-Predict-Base-14B \
    --transformer_ckpt_path $transformer_ckpt_path \
    --vae_type wan2.1 \
    --output_path converted/14b/e21d2a49-4747-44c8-ba44-9f6f9243715f/ \
    --save_pipeline
 ```
@@ -298,6 +337,25 @@ TRANSFORMER_CONFIGS = {
        "crossattn_proj_in_channels": 100352,
        "encoder_hidden_states_channels": 1024,
    },
    "Cosmos-2.5-Predict-Base-14B": {
        "in_channels": 16 + 1,
        "out_channels": 16,
        "num_attention_heads": 40,
        "attention_head_dim": 128,
        "num_layers": 36,
        "mlp_ratio": 4.0,
        "text_embed_dim": 1024,
        "adaln_lora_dim": 256,
        "max_size": (128, 240, 240),
        "patch_size": (1, 2, 2),
        "rope_scale": (1.0, 3.0, 3.0),
        "concat_padding_mask": True,
        # NOTE: source config has pos_emb_learnable: 'True' - but params are missing
        "extra_pos_embed_type": None,
        "use_crossattn_projection": True,
        "crossattn_proj_in_channels": 100352,
        "encoder_hidden_states_channels": 1024,
    },
 }
 VAE_KEYS_RENAME_DICT = {
--- a/src/diffusers/pipelines/auto_pipeline.py
+++ b/src/diffusers/pipelines/auto_pipeline.py
@@ -73,6 +73,7 @@ from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
 from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
 from .lumina import LuminaPipeline
 from .lumina2 import Lumina2Pipeline
 from .ovis_image import OvisImagePipeline
 from .pag import (
    HunyuanDiTPAGPipeline,
    PixArtSigmaPAGPipeline,
@@ -164,6 +165,7 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
        ("qwenimage", QwenImagePipeline),
        ("qwenimage-controlnet", QwenImageControlNetPipeline),
        ("z-image", ZImagePipeline),
        ("ovis", OvisImagePipeline),
    ]
 )
--- a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py
+++ b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py
@@ -133,7 +133,7 @@ EXAMPLE_DOC_STRING = """
        ...     num_frames=93,
        ...     generator=torch.Generator().manual_seed(1),
        ... ).frames[0]
-        >>> # export_to_video(video, "image2world.mp4", fps=16)
+        >>> export_to_video(video, "image2world.mp4", fps=16)
        >>> # Video2World: condition on an input clip and predict a 93-frame world video.
        >>> prompt = (
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -109,7 +109,7 @@ LIBRARIES = []
 for library in LOADABLE_CLASSES:
    LIBRARIES.append(library)
-SUPPORTED_DEVICE_MAP = ["balanced"] + [get_device()]
+SUPPORTED_DEVICE_MAP = ["balanced"] + [get_device(), "cpu"]
 logger = logging.get_logger(__name__)
@@ -462,8 +462,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
        pipeline_is_sequentially_offloaded = any(
            module_is_sequentially_offloaded(module) for _, module in self.components.items()
        )
-
+        is_pipeline_device_mapped = self._is_pipeline_device_mapped()
        is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
        if is_pipeline_device_mapped:
            raise ValueError(
                "It seems like you have activated a device mapping strategy on the pipeline which doesn't allow explicit device placement using `to()`. You can call `reset_device_map()` to remove the existing device map from the pipeline."
@@ -1164,7 +1163,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
        """
        self._maybe_raise_error_if_group_offload_active(raise_error=True)
-        is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
+        is_pipeline_device_mapped = self._is_pipeline_device_mapped()
        if is_pipeline_device_mapped:
            raise ValueError(
                "It seems like you have activated a device mapping strategy on the pipeline so calling `enable_model_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_model_cpu_offload()`."
@@ -1286,7 +1285,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
            raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
        self.remove_all_hooks()
-        is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
+        is_pipeline_device_mapped = self._is_pipeline_device_mapped()
        if is_pipeline_device_mapped:
            raise ValueError(
                "It seems like you have activated a device mapping strategy on the pipeline so calling `enable_sequential_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_sequential_cpu_offload()`."
@@ -2171,6 +2170,21 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
                return True
        return False
    def _is_pipeline_device_mapped(self):
        # We support passing `device_map="cuda"`, for example. This is helpful, in case
        # users want to pass `device_map="cpu"` when initializing a pipeline. This explicit declaration is desirable
        # in limited VRAM environments because quantized models often initialize directly on the accelerator.
        device_map = self.hf_device_map
        is_device_type_map = False
        if isinstance(device_map, str):
            try:
                torch.device(device_map)
                is_device_type_map = True
            except RuntimeError:
                pass
        return not is_device_type_map and isinstance(device_map, dict) and len(device_map) > 1
 class StableDiffusionMixin:
    r"""
Author	SHA1	Message	Date
Sayak Paul	c61e455ce7	Merge branch 'main' into device-map-direct	2025-12-23 13:16:10 +05:30
Miguel Martin	973a077c6a	Cosmos Predict2.5 14b Conversion (#12863 ) 14b conversion	2025-12-22 08:02:06 -10:00
Alvaro Bartolome	0c4f6c9cff	Add `OvisImagePipeline` in `AUTO_TEXT2IMAGE_PIPELINES_MAPPING` (#12876 )	2025-12-22 07:14:03 -10:00
Sayak Paul	6f5eb0a933	Merge branch 'main' into device-map-direct	2025-12-11 14:47:09 +08:00
sayakpaul	83ec2fb793	support device type device_maps to work with offloading.	2025-12-09 11:10:41 +05:30