mirror of
https://github.com/huggingface/diffusers.git
synced 2025-12-23 21:04:56 +08:00
Compare commits
5 Commits
torchao-co
...
device-map
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c61e455ce7 | ||
|
|
973a077c6a | ||
|
|
0c4f6c9cff | ||
|
|
6f5eb0a933 | ||
|
|
83ec2fb793 |
@@ -29,13 +29,52 @@ hf download nvidia/Cosmos-Predict2.5-2B
|
|||||||
|
|
||||||
Convert checkpoint
|
Convert checkpoint
|
||||||
```bash
|
```bash
|
||||||
|
# pre-trained
|
||||||
transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-2B/snapshots/865baf084d4c9e850eac59a021277d5a9b9e8b63/base/pre-trained/d20b7120-df3e-4911-919d-db6e08bad31c_ema_bf16.pt
|
transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-2B/snapshots/865baf084d4c9e850eac59a021277d5a9b9e8b63/base/pre-trained/d20b7120-df3e-4911-919d-db6e08bad31c_ema_bf16.pt
|
||||||
|
|
||||||
python scripts/convert_cosmos_to_diffusers.py \
|
python scripts/convert_cosmos_to_diffusers.py \
|
||||||
--transformer_type Cosmos-2.5-Predict-Base-2B \
|
--transformer_type Cosmos-2.5-Predict-Base-2B \
|
||||||
--transformer_ckpt_path $transformer_ckpt_path \
|
--transformer_ckpt_path $transformer_ckpt_path \
|
||||||
--vae_type wan2.1 \
|
--vae_type wan2.1 \
|
||||||
--output_path converted/cosmos-p2.5-base-2b \
|
--output_path converted/2b/d20b7120-df3e-4911-919d-db6e08bad31c \
|
||||||
|
--save_pipeline
|
||||||
|
|
||||||
|
# post-trained
|
||||||
|
transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-2B/snapshots/865baf084d4c9e850eac59a021277d5a9b9e8b63/base/post-trained/81edfebe-bd6a-4039-8c1d-737df1a790bf_ema_bf16.pt
|
||||||
|
|
||||||
|
python scripts/convert_cosmos_to_diffusers.py \
|
||||||
|
--transformer_type Cosmos-2.5-Predict-Base-2B \
|
||||||
|
--transformer_ckpt_path $transformer_ckpt_path \
|
||||||
|
--vae_type wan2.1 \
|
||||||
|
--output_path converted/2b/81edfebe-bd6a-4039-8c1d-737df1a790bf \
|
||||||
|
--save_pipeline
|
||||||
|
```
|
||||||
|
|
||||||
|
## 14B
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hf download nvidia/Cosmos-Predict2.5-14B
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# pre-trained
|
||||||
|
transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-14B/snapshots/71ebf3e8af30ecfe440bf0481115975fcc052b46/base/pre-trained/54937b8c-29de-4f04-862c-e67b04ec41e8_ema_bf16.pt
|
||||||
|
|
||||||
|
python scripts/convert_cosmos_to_diffusers.py \
|
||||||
|
--transformer_type Cosmos-2.5-Predict-Base-14B \
|
||||||
|
--transformer_ckpt_path $transformer_ckpt_path \
|
||||||
|
--vae_type wan2.1 \
|
||||||
|
--output_path converted/14b/54937b8c-29de-4f04-862c-e67b04ec41e8/ \
|
||||||
|
--save_pipeline
|
||||||
|
|
||||||
|
# post-trained
|
||||||
|
transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-14B/snapshots/71ebf3e8af30ecfe440bf0481115975fcc052b46/base/post-trained/e21d2a49-4747-44c8-ba44-9f6f9243715f_ema_bf16.pt
|
||||||
|
|
||||||
|
python scripts/convert_cosmos_to_diffusers.py \
|
||||||
|
--transformer_type Cosmos-2.5-Predict-Base-14B \
|
||||||
|
--transformer_ckpt_path $transformer_ckpt_path \
|
||||||
|
--vae_type wan2.1 \
|
||||||
|
--output_path converted/14b/e21d2a49-4747-44c8-ba44-9f6f9243715f/ \
|
||||||
--save_pipeline
|
--save_pipeline
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -298,6 +337,25 @@ TRANSFORMER_CONFIGS = {
|
|||||||
"crossattn_proj_in_channels": 100352,
|
"crossattn_proj_in_channels": 100352,
|
||||||
"encoder_hidden_states_channels": 1024,
|
"encoder_hidden_states_channels": 1024,
|
||||||
},
|
},
|
||||||
|
"Cosmos-2.5-Predict-Base-14B": {
|
||||||
|
"in_channels": 16 + 1,
|
||||||
|
"out_channels": 16,
|
||||||
|
"num_attention_heads": 40,
|
||||||
|
"attention_head_dim": 128,
|
||||||
|
"num_layers": 36,
|
||||||
|
"mlp_ratio": 4.0,
|
||||||
|
"text_embed_dim": 1024,
|
||||||
|
"adaln_lora_dim": 256,
|
||||||
|
"max_size": (128, 240, 240),
|
||||||
|
"patch_size": (1, 2, 2),
|
||||||
|
"rope_scale": (1.0, 3.0, 3.0),
|
||||||
|
"concat_padding_mask": True,
|
||||||
|
# NOTE: source config has pos_emb_learnable: 'True' - but params are missing
|
||||||
|
"extra_pos_embed_type": None,
|
||||||
|
"use_crossattn_projection": True,
|
||||||
|
"crossattn_proj_in_channels": 100352,
|
||||||
|
"encoder_hidden_states_channels": 1024,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
VAE_KEYS_RENAME_DICT = {
|
VAE_KEYS_RENAME_DICT = {
|
||||||
|
|||||||
@@ -73,6 +73,7 @@ from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
|
|||||||
from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
|
from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
|
||||||
from .lumina import LuminaPipeline
|
from .lumina import LuminaPipeline
|
||||||
from .lumina2 import Lumina2Pipeline
|
from .lumina2 import Lumina2Pipeline
|
||||||
|
from .ovis_image import OvisImagePipeline
|
||||||
from .pag import (
|
from .pag import (
|
||||||
HunyuanDiTPAGPipeline,
|
HunyuanDiTPAGPipeline,
|
||||||
PixArtSigmaPAGPipeline,
|
PixArtSigmaPAGPipeline,
|
||||||
@@ -164,6 +165,7 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
|
|||||||
("qwenimage", QwenImagePipeline),
|
("qwenimage", QwenImagePipeline),
|
||||||
("qwenimage-controlnet", QwenImageControlNetPipeline),
|
("qwenimage-controlnet", QwenImageControlNetPipeline),
|
||||||
("z-image", ZImagePipeline),
|
("z-image", ZImagePipeline),
|
||||||
|
("ovis", OvisImagePipeline),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -133,7 +133,7 @@ EXAMPLE_DOC_STRING = """
|
|||||||
... num_frames=93,
|
... num_frames=93,
|
||||||
... generator=torch.Generator().manual_seed(1),
|
... generator=torch.Generator().manual_seed(1),
|
||||||
... ).frames[0]
|
... ).frames[0]
|
||||||
>>> # export_to_video(video, "image2world.mp4", fps=16)
|
>>> export_to_video(video, "image2world.mp4", fps=16)
|
||||||
|
|
||||||
>>> # Video2World: condition on an input clip and predict a 93-frame world video.
|
>>> # Video2World: condition on an input clip and predict a 93-frame world video.
|
||||||
>>> prompt = (
|
>>> prompt = (
|
||||||
|
|||||||
@@ -109,7 +109,7 @@ LIBRARIES = []
|
|||||||
for library in LOADABLE_CLASSES:
|
for library in LOADABLE_CLASSES:
|
||||||
LIBRARIES.append(library)
|
LIBRARIES.append(library)
|
||||||
|
|
||||||
SUPPORTED_DEVICE_MAP = ["balanced"] + [get_device()]
|
SUPPORTED_DEVICE_MAP = ["balanced"] + [get_device(), "cpu"]
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
@@ -462,8 +462,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
|||||||
pipeline_is_sequentially_offloaded = any(
|
pipeline_is_sequentially_offloaded = any(
|
||||||
module_is_sequentially_offloaded(module) for _, module in self.components.items()
|
module_is_sequentially_offloaded(module) for _, module in self.components.items()
|
||||||
)
|
)
|
||||||
|
is_pipeline_device_mapped = self._is_pipeline_device_mapped()
|
||||||
is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
|
|
||||||
if is_pipeline_device_mapped:
|
if is_pipeline_device_mapped:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"It seems like you have activated a device mapping strategy on the pipeline which doesn't allow explicit device placement using `to()`. You can call `reset_device_map()` to remove the existing device map from the pipeline."
|
"It seems like you have activated a device mapping strategy on the pipeline which doesn't allow explicit device placement using `to()`. You can call `reset_device_map()` to remove the existing device map from the pipeline."
|
||||||
@@ -1164,7 +1163,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
|||||||
"""
|
"""
|
||||||
self._maybe_raise_error_if_group_offload_active(raise_error=True)
|
self._maybe_raise_error_if_group_offload_active(raise_error=True)
|
||||||
|
|
||||||
is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
|
is_pipeline_device_mapped = self._is_pipeline_device_mapped()
|
||||||
if is_pipeline_device_mapped:
|
if is_pipeline_device_mapped:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"It seems like you have activated a device mapping strategy on the pipeline so calling `enable_model_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_model_cpu_offload()`."
|
"It seems like you have activated a device mapping strategy on the pipeline so calling `enable_model_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_model_cpu_offload()`."
|
||||||
@@ -1286,7 +1285,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
|||||||
raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
|
raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
|
||||||
self.remove_all_hooks()
|
self.remove_all_hooks()
|
||||||
|
|
||||||
is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
|
is_pipeline_device_mapped = self._is_pipeline_device_mapped()
|
||||||
if is_pipeline_device_mapped:
|
if is_pipeline_device_mapped:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"It seems like you have activated a device mapping strategy on the pipeline so calling `enable_sequential_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_sequential_cpu_offload()`."
|
"It seems like you have activated a device mapping strategy on the pipeline so calling `enable_sequential_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_sequential_cpu_offload()`."
|
||||||
@@ -2171,6 +2170,21 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def _is_pipeline_device_mapped(self):
|
||||||
|
# We support passing `device_map="cuda"`, for example. This is helpful, in case
|
||||||
|
# users want to pass `device_map="cpu"` when initializing a pipeline. This explicit declaration is desirable
|
||||||
|
# in limited VRAM environments because quantized models often initialize directly on the accelerator.
|
||||||
|
device_map = self.hf_device_map
|
||||||
|
is_device_type_map = False
|
||||||
|
if isinstance(device_map, str):
|
||||||
|
try:
|
||||||
|
torch.device(device_map)
|
||||||
|
is_device_type_map = True
|
||||||
|
except RuntimeError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return not is_device_type_map and isinstance(device_map, dict) and len(device_map) > 1
|
||||||
|
|
||||||
|
|
||||||
class StableDiffusionMixin:
|
class StableDiffusionMixin:
|
||||||
r"""
|
r"""
|
||||||
|
|||||||
Reference in New Issue
Block a user