Compare commits

..

1 Commits

Author SHA1 Message Date
Álvaro Somoza
3833ca425f initial 2026-01-10 00:29:41 -03:00
5 changed files with 33 additions and 30 deletions

View File

@@ -29,7 +29,7 @@ Cache methods speedup diffusion transformers by storing and reusing intermediate
[[autodoc]] apply_faster_cache
## FirstBlockCacheConfig
### FirstBlockCacheConfig
[[autodoc]] FirstBlockCacheConfig

View File

@@ -68,20 +68,6 @@ config = FasterCacheConfig(
pipeline.transformer.enable_cache(config)
```
## FirstBlockCache
[FirstBlock Cache](https://huggingface.co/docs/diffusers/main/en/api/cache#diffusers.FirstBlockCacheConfig) checks how much the early layers of the denoiser change from one timestep to the next. If the change is small, the model skips the expensive later layers and reuses the previous output.
```py
import torch
from diffusers import DiffusionPipeline
from diffusers.hooks import apply_first_block_cache, FirstBlockCacheConfig
pipeline = DiffusionPipeline.from_pretrained(
"Qwen/Qwen-Image", torch_dtype=torch.bfloat16
)
apply_first_block_cache(pipeline.transformer, FirstBlockCacheConfig(threshold=0.2))
```
## TaylorSeer Cache
[TaylorSeer Cache](https://huggingface.co/papers/2403.06923) accelerates diffusion inference by using Taylor series expansions to approximate and cache intermediate activations across denoising steps. The method predicts future outputs based on past computations, reusing them at specified intervals to reduce redundant calculations.
@@ -101,7 +87,8 @@ from diffusers import FluxPipeline, TaylorSeerCacheConfig
pipe = FluxPipeline.from_pretrained(
"black-forest-labs/FLUX.1-dev",
torch_dtype=torch.bfloat16,
).to("cuda")
)
pipe.to("cuda")
config = TaylorSeerCacheConfig(
cache_interval=5,
@@ -110,4 +97,4 @@ config = TaylorSeerCacheConfig(
taylor_factors_dtype=torch.bfloat16,
)
pipe.transformer.enable_cache(config)
```
```

View File

@@ -41,11 +41,9 @@ class CacheMixin:
Enable caching techniques on the model.
Args:
config (`Union[PyramidAttentionBroadcastConfig, FasterCacheConfig, FirstBlockCacheConfig]`):
config (`Union[PyramidAttentionBroadcastConfig]`):
The configuration for applying the caching technique. Currently supported caching techniques are:
- [`~hooks.PyramidAttentionBroadcastConfig`]
- [`~hooks.FasterCacheConfig`]
- [`~hooks.FirstBlockCacheConfig`]
Example:

View File

@@ -68,6 +68,10 @@ class MellonParam:
def image_latents(cls, display: str = "input") -> "MellonParam":
return cls(name="image_latents", label="Image Latents", type="latents", display=display)
@classmethod
def first_frame_latents(cls, display: str = "input") -> "MellonParam":
return cls(name="first_frame_latents", label="First Frame Latents", type="latents", display=display)
@classmethod
def image_latents_with_strength(cls) -> "MellonParam":
return cls(
@@ -89,6 +93,10 @@ class MellonParam:
def embeddings(cls, display: str = "output") -> "MellonParam":
return cls(name="embeddings", label="Text Embeddings", type="embeddings", display=display)
@classmethod
def image_embeds(cls, display: str = "output") -> "MellonParam":
return cls(name="image_embeds", label="Image Embeddings", type="image_embeds", display=display)
@classmethod
def controlnet_conditioning_scale(cls, default: float = 0.5) -> "MellonParam":
return cls(
@@ -186,6 +194,16 @@ class MellonParam:
"""
return cls(name="vae", label="VAE", type="diffusers_auto_model", display="input")
@classmethod
def image_encoder(cls) -> "MellonParam":
"""
Image Encoder model info dict.
Contains keys like 'model_id', 'repo_id', 'execution_device' etc. Use components.get_one(model_id) to retrieve
the actual model.
"""
return cls(name="image_encoder", label="Image Encoder", type="diffusers_auto_model", display="input")
@classmethod
def unet(cls) -> "MellonParam":
"""

View File

@@ -84,7 +84,7 @@ class WanImage2VideoImageEncoderStep(SequentialPipelineBlocks):
class WanImage2VideoVaeImageEncoderStep(SequentialPipelineBlocks):
model_name = "wan"
block_classes = [WanImageResizeStep, WanVaeImageEncoderStep]
block_names = ["image_resize", "vae_image_encoder"]
block_names = ["image_resize", "vae_encoder"]
@property
def description(self):
@@ -142,7 +142,7 @@ class WanFLF2VImageEncoderStep(SequentialPipelineBlocks):
class WanFLF2VVaeImageEncoderStep(SequentialPipelineBlocks):
model_name = "wan"
block_classes = [WanImageResizeStep, WanImageCropResizeStep, WanFirstLastFrameVaeImageEncoderStep]
block_names = ["image_resize", "last_image_resize", "vae_image_encoder"]
block_names = ["image_resize", "last_image_resize", "vae_encoder"]
@property
def description(self):
@@ -203,7 +203,7 @@ class WanAutoImageEncoderStep(AutoPipelineBlocks):
## vae encoder
class WanAutoVaeImageEncoderStep(AutoPipelineBlocks):
block_classes = [WanFLF2VVaeImageEncoderStep, WanImage2VideoVaeImageEncoderStep]
block_names = ["flf2v_vae_image_encoder", "image2video_vae_image_encoder"]
block_names = ["flf2v_vae_encoder", "image2video_vae_encoder"]
block_trigger_inputs = ["last_image", "image"]
@property
@@ -251,7 +251,7 @@ class WanAutoBlocks(SequentialPipelineBlocks):
block_names = [
"text_encoder",
"image_encoder",
"vae_image_encoder",
"vae_encoder",
"denoise",
"decode",
]
@@ -353,7 +353,7 @@ class Wan22AutoBlocks(SequentialPipelineBlocks):
]
block_names = [
"text_encoder",
"vae_image_encoder",
"vae_encoder",
"denoise",
"decode",
]
@@ -384,7 +384,7 @@ IMAGE2VIDEO_BLOCKS = InsertableDict(
[
("image_resize", WanImageResizeStep),
("image_encoder", WanImage2VideoImageEncoderStep),
("vae_image_encoder", WanImage2VideoVaeImageEncoderStep),
("vae_encoder", WanImage2VideoVaeImageEncoderStep),
("input", WanTextInputStep),
("additional_inputs", WanAdditionalInputsStep(image_latent_inputs=["first_frame_latents"])),
("set_timesteps", WanSetTimestepsStep),
@@ -401,7 +401,7 @@ FLF2V_BLOCKS = InsertableDict(
("image_resize", WanImageResizeStep),
("last_image_resize", WanImageCropResizeStep),
("image_encoder", WanFLF2VImageEncoderStep),
("vae_image_encoder", WanFLF2VVaeImageEncoderStep),
("vae_encoder", WanFLF2VVaeImageEncoderStep),
("input", WanTextInputStep),
("additional_inputs", WanAdditionalInputsStep(image_latent_inputs=["first_last_frame_latents"])),
("set_timesteps", WanSetTimestepsStep),
@@ -416,7 +416,7 @@ AUTO_BLOCKS = InsertableDict(
[
("text_encoder", WanTextEncoderStep),
("image_encoder", WanAutoImageEncoderStep),
("vae_image_encoder", WanAutoVaeImageEncoderStep),
("vae_encoder", WanAutoVaeImageEncoderStep),
("denoise", WanAutoDenoiseStep),
("decode", WanImageVaeDecoderStep),
]
@@ -438,7 +438,7 @@ TEXT2VIDEO_BLOCKS_WAN22 = InsertableDict(
IMAGE2VIDEO_BLOCKS_WAN22 = InsertableDict(
[
("image_resize", WanImageResizeStep),
("vae_image_encoder", WanImage2VideoVaeImageEncoderStep),
("vae_encoder", WanImage2VideoVaeImageEncoderStep),
("input", WanTextInputStep),
("set_timesteps", WanSetTimestepsStep),
("prepare_latents", WanPrepareLatentsStep),
@@ -450,7 +450,7 @@ IMAGE2VIDEO_BLOCKS_WAN22 = InsertableDict(
AUTO_BLOCKS_WAN22 = InsertableDict(
[
("text_encoder", WanTextEncoderStep),
("vae_image_encoder", WanAutoVaeImageEncoderStep),
("vae_encoder", WanAutoVaeImageEncoderStep),
("denoise", Wan22AutoDenoiseStep),
("decode", WanImageVaeDecoderStep),
]