fix torchao typo.

Cosmos Predict2.5 14b Conversion (#12863)
14b conversion
2025-12-24 05:14:55 +08:00 · 2025-12-23 12:56:51 +05:30 · 2025-12-22 08:02:06 -10:00 · 2025-12-22 07:14:03 -10:00
4 changed files with 65 additions and 5 deletions
--- a/docs/source/en/quantization/torchao.md
+++ b/docs/source/en/quantization/torchao.md
@@ -33,7 +33,7 @@ pipeline_quant_config = PipelineQuantizationConfig(
 )
 pipeline = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
-    quantzation_config=pipeline_quant_config,
+    quantization_config=pipeline_quant_config,
    torch_dtype=torch.bfloat16,
    device_map="cuda"
 )
@@ -50,7 +50,7 @@ pipeline_quant_config = PipelineQuantizationConfig(
 )
 pipeline = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
-    quantzation_config=pipeline_quant_config,
+    quantization_config=pipeline_quant_config,
    torch_dtype=torch.bfloat16,
    device_map="cuda"
 )
@@ -70,7 +70,7 @@ pipeline_quant_config = PipelineQuantizationConfig(
 )
 pipeline = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
-    quantzation_config=pipeline_quant_config,
+    quantization_config=pipeline_quant_config,
    torch_dtype=torch.bfloat16,
    device_map="cuda"
 )
--- a/scripts/convert_cosmos_to_diffusers.py
+++ b/scripts/convert_cosmos_to_diffusers.py
@@ -29,13 +29,52 @@ hf download nvidia/Cosmos-Predict2.5-2B

 Convert checkpoint
 ```bash
+# pre-trained
 transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-2B/snapshots/865baf084d4c9e850eac59a021277d5a9b9e8b63/base/pre-trained/d20b7120-df3e-4911-919d-db6e08bad31c_ema_bf16.pt

 python scripts/convert_cosmos_to_diffusers.py \
    --transformer_type Cosmos-2.5-Predict-Base-2B \
    --transformer_ckpt_path $transformer_ckpt_path \
    --vae_type wan2.1 \
-    --output_path converted/cosmos-p2.5-base-2b \
+    --output_path converted/2b/d20b7120-df3e-4911-919d-db6e08bad31c \
+    --save_pipeline
+
+# post-trained
+transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-2B/snapshots/865baf084d4c9e850eac59a021277d5a9b9e8b63/base/post-trained/81edfebe-bd6a-4039-8c1d-737df1a790bf_ema_bf16.pt
+
+python scripts/convert_cosmos_to_diffusers.py \
+    --transformer_type Cosmos-2.5-Predict-Base-2B \
+    --transformer_ckpt_path $transformer_ckpt_path \
+    --vae_type wan2.1 \
+    --output_path converted/2b/81edfebe-bd6a-4039-8c1d-737df1a790bf \
+    --save_pipeline
+```
+
+## 14B
+
+```bash
+hf download nvidia/Cosmos-Predict2.5-14B
+```
+
+```bash
+# pre-trained
+transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-14B/snapshots/71ebf3e8af30ecfe440bf0481115975fcc052b46/base/pre-trained/54937b8c-29de-4f04-862c-e67b04ec41e8_ema_bf16.pt
+
+python scripts/convert_cosmos_to_diffusers.py \
+    --transformer_type Cosmos-2.5-Predict-Base-14B \
+    --transformer_ckpt_path $transformer_ckpt_path \
+    --vae_type wan2.1 \
+    --output_path converted/14b/54937b8c-29de-4f04-862c-e67b04ec41e8/ \
+    --save_pipeline
+
+# post-trained
+transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-14B/snapshots/71ebf3e8af30ecfe440bf0481115975fcc052b46/base/post-trained/e21d2a49-4747-44c8-ba44-9f6f9243715f_ema_bf16.pt
+
+python scripts/convert_cosmos_to_diffusers.py \
+    --transformer_type Cosmos-2.5-Predict-Base-14B \
+    --transformer_ckpt_path $transformer_ckpt_path \
+    --vae_type wan2.1 \
+    --output_path converted/14b/e21d2a49-4747-44c8-ba44-9f6f9243715f/ \
    --save_pipeline
 ```

@@ -298,6 +337,25 @@ TRANSFORMER_CONFIGS = {
        "crossattn_proj_in_channels": 100352,
        "encoder_hidden_states_channels": 1024,
    },
+    "Cosmos-2.5-Predict-Base-14B": {
+        "in_channels": 16 + 1,
+        "out_channels": 16,
+        "num_attention_heads": 40,
+        "attention_head_dim": 128,
+        "num_layers": 36,
+        "mlp_ratio": 4.0,
+        "text_embed_dim": 1024,
+        "adaln_lora_dim": 256,
+        "max_size": (128, 240, 240),
+        "patch_size": (1, 2, 2),
+        "rope_scale": (1.0, 3.0, 3.0),
+        "concat_padding_mask": True,
+        # NOTE: source config has pos_emb_learnable: 'True' - but params are missing
+        "extra_pos_embed_type": None,
+        "use_crossattn_projection": True,
+        "crossattn_proj_in_channels": 100352,
+        "encoder_hidden_states_channels": 1024,
+    },
 }

 VAE_KEYS_RENAME_DICT = {
--- a/src/diffusers/pipelines/auto_pipeline.py
+++ b/src/diffusers/pipelines/auto_pipeline.py
@@ -73,6 +73,7 @@ from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
 from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
 from .lumina import LuminaPipeline
 from .lumina2 import Lumina2Pipeline
+from .ovis_image import OvisImagePipeline
 from .pag import (
    HunyuanDiTPAGPipeline,
    PixArtSigmaPAGPipeline,
@@ -164,6 +165,7 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
        ("qwenimage", QwenImagePipeline),
        ("qwenimage-controlnet", QwenImageControlNetPipeline),
        ("z-image", ZImagePipeline),
+        ("ovis", OvisImagePipeline),
    ]
 )

--- a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py
+++ b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py
@@ -133,7 +133,7 @@ EXAMPLE_DOC_STRING = """
        ...     num_frames=93,
        ...     generator=torch.Generator().manual_seed(1),
        ... ).frames[0]
-        >>> # export_to_video(video, "image2world.mp4", fps=16)
+        >>> export_to_video(video, "image2world.mp4", fps=16)

        >>> # Video2World: condition on an input clip and predict a 93-frame world video.
        >>> prompt = (
Author	SHA1	Message	Date
sayakpaul	e4fc2a138d	fix torchao typo.	2025-12-23 12:56:51 +05:30
Miguel Martin	973a077c6a	Cosmos Predict2.5 14b Conversion (#12863) 14b conversion	2025-12-22 08:02:06 -10:00
Alvaro Bartolome	0c4f6c9cff	Add `OvisImagePipeline` in `AUTO_TEXT2IMAGE_PIPELINES_MAPPING` (#12876)	2025-12-22 07:14:03 -10:00