mirror of https://github.com/huggingface/diffusers.git
synced 2025-12-18 02:14:43 +08:00

Compare commits: enable-tel...ad-decode- (2 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 94bfe7da73 | |
| | cb508450de | |

The compared commits rename the `vae_batch_size` argument to `decode_chunk_size` across the AnimateDiff pipelines (text-to-video, ControlNet, video-to-video, and PAG variants), updating the method signatures, docstrings, and every call site.
@@ -396,15 +396,15 @@ class AnimateDiffPipeline(
         return ip_adapter_image_embeds
 
-    def decode_latents(self, latents, vae_batch_size: int = 16):
+    def decode_latents(self, latents, decode_chunk_size: int = 16):
         latents = 1 / self.vae.config.scaling_factor * latents
 
         batch_size, channels, num_frames, height, width = latents.shape
         latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
 
         video = []
-        for i in range(0, latents.shape[0], vae_batch_size):
-            batch_latents = latents[i : i + vae_batch_size]
+        for i in range(0, latents.shape[0], decode_chunk_size):
+            batch_latents = latents[i : i + decode_chunk_size]
             batch_latents = self.vae.decode(batch_latents).sample
             video.append(batch_latents)
 
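What the renamed helper does: it folds the frame axis into the batch axis and runs the image VAE over the frames in chunks, so peak decoder memory is bounded by `decode_chunk_size` frames instead of the whole video. A minimal standalone sketch of the same pattern, assuming a hypothetical `vae` object with a diffusers-style `decode(...).sample` interface:

```python
import torch

def chunked_vae_decode(latents: torch.Tensor, vae, scaling_factor: float,
                       decode_chunk_size: int = 16) -> torch.Tensor:
    # latents: (batch, channels, num_frames, height, width), as in decode_latents above
    latents = latents / scaling_factor
    batch_size, channels, num_frames, height, width = latents.shape
    # Fold frames into the batch axis so a per-image VAE can decode them
    frames = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)

    video = []
    for i in range(0, frames.shape[0], decode_chunk_size):
        # At most decode_chunk_size frames pass through the decoder at once
        video.append(vae.decode(frames[i : i + decode_chunk_size]).sample)
    return torch.cat(video)
```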
@@ -582,7 +582,7 @@ class AnimateDiffPipeline(
         clip_skip: Optional[int] = None,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
-        vae_batch_size: int = 16,
+        decode_chunk_size: int = 16,
         **kwargs,
     ):
         r"""
@@ -651,7 +651,7 @@ class AnimateDiffPipeline(
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
-            vae_batch_size (`int`, defaults to `16`):
+            decode_chunk_size (`int`, defaults to `16`):
                 The number of frames to decode at a time when calling `decode_latents` method.
 
         Examples:
@@ -824,7 +824,7 @@ class AnimateDiffPipeline(
         if output_type == "latent":
             video = latents
         else:
-            video_tensor = self.decode_latents(latents, vae_batch_size)
+            video_tensor = self.decode_latents(latents, decode_chunk_size)
             video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
 
         # 10. Offload all models
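For callers, only the keyword changes; the default of 16 is unchanged. A hedged end-to-end sketch (the checkpoint IDs are illustrative, and the call assumes a diffusers build that already includes this rename):

```python
import torch
from diffusers import AnimateDiffPipeline, MotionAdapter

# Illustrative checkpoints; any AnimateDiff-compatible SD 1.5 base model works
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
pipe = AnimateDiffPipeline.from_pretrained(
    "emilianJR/epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16
).to("cuda")

output = pipe(
    prompt="a panda surfing, high quality",
    num_frames=16,
    decode_chunk_size=8,  # previously vae_batch_size; lower values trade speed for memory
)
frames = output.frames[0]
```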
@@ -435,15 +435,15 @@ class AnimateDiffControlNetPipeline(
         return ip_adapter_image_embeds
 
     # Copied from diffusers.pipelines.animatediff.pipeline_animatediff.AnimateDiffPipeline.decode_latents
-    def decode_latents(self, latents, vae_batch_size: int = 16):
+    def decode_latents(self, latents, decode_chunk_size: int = 16):
         latents = 1 / self.vae.config.scaling_factor * latents
 
         batch_size, channels, num_frames, height, width = latents.shape
         latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
 
         video = []
-        for i in range(0, latents.shape[0], vae_batch_size):
-            batch_latents = latents[i : i + vae_batch_size]
+        for i in range(0, latents.shape[0], decode_chunk_size):
+            batch_latents = latents[i : i + decode_chunk_size]
             batch_latents = self.vae.decode(batch_latents).sample
             video.append(batch_latents)
 
@@ -728,7 +728,7 @@ class AnimateDiffControlNetPipeline(
         clip_skip: Optional[int] = None,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
-        vae_batch_size: int = 16,
+        decode_chunk_size: int = 16,
     ):
         r"""
         The call function to the pipeline for generation.
@@ -1064,7 +1064,7 @@ class AnimateDiffControlNetPipeline(
         if output_type == "latent":
             video = latents
         else:
-            video_tensor = self.decode_latents(latents, vae_batch_size)
+            video_tensor = self.decode_latents(latents, decode_chunk_size)
             video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
 
         # 10. Offload all models
@@ -500,24 +500,24 @@ class AnimateDiffVideoToVideoPipeline(
 
         return ip_adapter_image_embeds
 
-    def encode_video(self, video, generator, vae_batch_size: int = 16) -> torch.Tensor:
+    def encode_video(self, video, generator, decode_chunk_size: int = 16) -> torch.Tensor:
         latents = []
-        for i in range(0, len(video), vae_batch_size):
-            batch_video = video[i : i + vae_batch_size]
+        for i in range(0, len(video), decode_chunk_size):
+            batch_video = video[i : i + decode_chunk_size]
             batch_video = retrieve_latents(self.vae.encode(batch_video), generator=generator)
             latents.append(batch_video)
         return torch.cat(latents)
 
     # Copied from diffusers.pipelines.animatediff.pipeline_animatediff.AnimateDiffPipeline.decode_latents
-    def decode_latents(self, latents, vae_batch_size: int = 16):
+    def decode_latents(self, latents, decode_chunk_size: int = 16):
         latents = 1 / self.vae.config.scaling_factor * latents
 
         batch_size, channels, num_frames, height, width = latents.shape
         latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
 
         video = []
-        for i in range(0, latents.shape[0], vae_batch_size):
-            batch_latents = latents[i : i + vae_batch_size]
+        for i in range(0, latents.shape[0], decode_chunk_size):
+            batch_latents = latents[i : i + decode_chunk_size]
             batch_latents = self.vae.decode(batch_latents).sample
             video.append(batch_latents)
 
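The video-to-video variant chunk-encodes the input clip the same way (note that the renamed `decode_chunk_size` keyword is reused on the encode path). A minimal sketch of the encode side, assuming a diffusers-style `vae.encode(...).latent_dist` interface, which is what `retrieve_latents` samples from by default:

```python
import torch

def chunked_vae_encode(video: torch.Tensor, vae, generator=None,
                       chunk_size: int = 16) -> torch.Tensor:
    # video: (num_frames, channels, height, width), one clip at a time
    latents = []
    for i in range(0, len(video), chunk_size):
        posterior = vae.encode(video[i : i + chunk_size]).latent_dist
        # retrieve_latents samples the posterior with the supplied generator
        latents.append(posterior.sample(generator=generator))
    return torch.cat(latents)
```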
@@ -638,7 +638,7 @@ class AnimateDiffVideoToVideoPipeline(
         device,
         generator,
         latents=None,
-        vae_batch_size: int = 16,
+        decode_chunk_size: int = 16,
     ):
         if latents is None:
             num_frames = video.shape[1]
@@ -673,10 +673,11 @@ class AnimateDiffVideoToVideoPipeline(
             )
 
             init_latents = [
-                self.encode_video(video[i], generator[i], vae_batch_size).unsqueeze(0) for i in range(batch_size)
+                self.encode_video(video[i], generator[i], decode_chunk_size).unsqueeze(0)
+                for i in range(batch_size)
             ]
         else:
-            init_latents = [self.encode_video(vid, generator, vae_batch_size).unsqueeze(0) for vid in video]
+            init_latents = [self.encode_video(vid, generator, decode_chunk_size).unsqueeze(0) for vid in video]
 
         init_latents = torch.cat(init_latents, dim=0)
 
@@ -761,7 +762,7 @@ class AnimateDiffVideoToVideoPipeline(
         clip_skip: Optional[int] = None,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
-        vae_batch_size: int = 16,
+        decode_chunk_size: int = 16,
     ):
         r"""
         The call function to the pipeline for generation.
@@ -837,7 +838,7 @@ class AnimateDiffVideoToVideoPipeline(
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                 `._callback_tensor_inputs` attribute of your pipeline class.
-            vae_batch_size (`int`, defaults to `16`):
+            decode_chunk_size (`int`, defaults to `16`):
                 The number of frames to decode at a time when calling `decode_latents` method.
 
         Examples:
@@ -940,7 +941,7 @@ class AnimateDiffVideoToVideoPipeline(
             device=device,
             generator=generator,
             latents=latents,
-            vae_batch_size=vae_batch_size,
+            decode_chunk_size=decode_chunk_size,
         )
 
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
@@ -1008,7 +1009,7 @@ class AnimateDiffVideoToVideoPipeline(
         if output_type == "latent":
             video = latents
         else:
-            video_tensor = self.decode_latents(latents, vae_batch_size)
+            video_tensor = self.decode_latents(latents, decode_chunk_size)
             video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
 
         # 10. Offload all models
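In the video-to-video pipeline the single keyword now governs both the chunked encode of the input clip (via `prepare_latents` → `encode_video`) and the chunked decode of the result. A hedged call sketch (checkpoint IDs and the input path are illustrative, and `load_video` assumes a recent diffusers release):

```python
import torch
from diffusers import AnimateDiffVideoToVideoPipeline, MotionAdapter
from diffusers.utils import load_video

adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
pipe = AnimateDiffVideoToVideoPipeline.from_pretrained(
    "emilianJR/epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16
).to("cuda")

video = load_video("input.mp4")  # list of PIL frames; the path is a placeholder
output = pipe(
    video=video,
    prompt="the same scene as an oil painting",
    strength=0.6,
    decode_chunk_size=8,  # applies to both encode_video and decode_latents
)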
@@ -407,15 +407,15 @@ class AnimateDiffPAGPipeline(
         return ip_adapter_image_embeds
 
     # Copied from diffusers.pipelines.animatediff.pipeline_animatediff.AnimateDiffPipeline.decode_latents
-    def decode_latents(self, latents, vae_batch_size: int = 16):
+    def decode_latents(self, latents, decode_chunk_size: int = 16):
         latents = 1 / self.vae.config.scaling_factor * latents
 
         batch_size, channels, num_frames, height, width = latents.shape
         latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
 
         video = []
-        for i in range(0, latents.shape[0], vae_batch_size):
-            batch_latents = latents[i : i + vae_batch_size]
+        for i in range(0, latents.shape[0], decode_chunk_size):
+            batch_latents = latents[i : i + decode_chunk_size]
             batch_latents = self.vae.decode(batch_latents).sample
             video.append(batch_latents)
 
@@ -588,7 +588,7 @@ class AnimateDiffPAGPipeline(
         clip_skip: Optional[int] = None,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
-        vae_batch_size: int = 16,
+        decode_chunk_size: int = 16,
         pag_scale: float = 3.0,
         pag_adaptive_scale: float = 0.0,
     ):
@@ -847,7 +847,7 @@ class AnimateDiffPAGPipeline(
         if output_type == "latent":
             video = latents
         else:
-            video_tensor = self.decode_latents(latents, vae_batch_size)
+            video_tensor = self.decode_latents(latents, decode_chunk_size)
             video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
 
         # 10. Offload all models