diff --git a/src/diffusers/pipelines/ltx2/pipeline_ltx2_condition.py b/src/diffusers/pipelines/ltx2/pipeline_ltx2_condition.py
index f886354e33..58d65ba764 100644
--- a/src/diffusers/pipelines/ltx2/pipeline_ltx2_condition.py
+++ b/src/diffusers/pipelines/ltx2/pipeline_ltx2_condition.py
@@ -119,7 +119,7 @@ class LTX2VideoCondition:
 
 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
 def retrieve_latents(
-    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+    encoder_output: torch.Tensor, generator: torch.Generator | None = None, sample_mode: str = "sample"
 ):
     if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
         return encoder_output.latent_dist.sample(generator)
@@ -148,10 +148,10 @@ def calculate_shift(
 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
 def retrieve_timesteps(
     scheduler,
-    num_inference_steps: Optional[int] = None,
-    device: Optional[Union[str, torch.device]] = None,
-    timesteps: Optional[List[int]] = None,
-    sigmas: Optional[List[float]] = None,
+    num_inference_steps: int | None = None,
+    device: str | torch.device | None = None,
+    timesteps: list[int] | None = None,
+    sigmas: list[float] | None = None,
     **kwargs,
 ):
     r"""
@@ -166,15 +166,15 @@ def retrieve_timesteps(
             must be `None`.
         device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
-        timesteps (`List[int]`, *optional*):
+        timesteps (`list[int]`, *optional*):
             Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is
             passed, `num_inference_steps` and `sigmas` must be `None`.
-        sigmas (`List[float]`, *optional*):
+        sigmas (`list[float]`, *optional*):
             Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
             `num_inference_steps` and `timesteps` must be `None`.
 
     Returns:
-        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        `tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
         second element is the number of inference steps.
     """
     if timesteps is not None and sigmas is not None:
@@ -306,7 +306,7 @@ class LTX2ConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTX2LoraLoad
     def _pack_text_embeds(
         text_hidden_states: torch.Tensor,
         sequence_lengths: torch.Tensor,
-        device: Union[str, torch.device],
+        device: str | torch.device,
         padding_side: str = "left",
         scale_factor: int = 8,
         eps: float = 1e-6,
@@ -372,18 +372,18 @@ class LTX2ConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTX2LoraLoad
     # Copied from diffusers.pipelines.ltx2.pipeline_ltx2.LTX2Pipeline._get_gemma_prompt_embeds
     def _get_gemma_prompt_embeds(
         self,
-        prompt: Union[str, List[str]],
+        prompt: str | list[str],
         num_videos_per_prompt: int = 1,
         max_sequence_length: int = 1024,
         scale_factor: int = 8,
-        device: Optional[torch.device] = None,
-        dtype: Optional[torch.dtype] = None,
+        device: torch.device | None = None,
+        dtype: torch.dtype | None = None,
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
 
         Args:
-            prompt (`str` or `List[str]`, *optional*):
+            prompt (`str` or `list[str]`, *optional*):
                 prompt to be encoded
             device: (`str` or `torch.device`):
                 torch device to place the resulting embeddings on
@@ -446,26 +446,26 @@ class LTX2ConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTX2LoraLoad
     # Copied from diffusers.pipelines.ltx2.pipeline_ltx2.LTX2Pipeline.encode_prompt
     def encode_prompt(
         self,
-        prompt: Union[str, List[str]],
-        negative_prompt: Optional[Union[str, List[str]]] = None,
+        prompt: str | list[str],
+        negative_prompt: str | list[str] | None = None,
         do_classifier_free_guidance: bool = True,
         num_videos_per_prompt: int = 1,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
-        prompt_attention_mask: Optional[torch.Tensor] = None,
-        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        prompt_embeds: torch.Tensor | None = None,
+        negative_prompt_embeds: torch.Tensor | None = None,
+        prompt_attention_mask: torch.Tensor | None = None,
+        negative_prompt_attention_mask: torch.Tensor | None = None,
         max_sequence_length: int = 1024,
         scale_factor: int = 8,
-        device: Optional[torch.device] = None,
-        dtype: Optional[torch.dtype] = None,
+        device: torch.device | None = None,
+        dtype: torch.dtype | None = None,
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
 
         Args:
-            prompt (`str` or `List[str]`, *optional*):
+            prompt (`str` or `list[str]`, *optional*):
                 prompt to be encoded
-            negative_prompt (`str` or `List[str]`, *optional*):
+            negative_prompt (`str` or `list[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                 less than `1`).
@@ -644,7 +644,7 @@ class LTX2ConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTX2LoraLoad
     @staticmethod
     # Copied from diffusers.pipelines.ltx2.pipeline_ltx2.LTX2Pipeline._create_noised_state
     def _create_noised_state(
-        latents: torch.Tensor, noise_scale: Union[float, torch.Tensor], generator: Optional[torch.Generator] = None
+        latents: torch.Tensor, noise_scale: float | torch.Tensor, generator: torch.Generator | None = None
     ):
         noise = randn_tensor(latents.shape, generator=generator, device=latents.device, dtype=latents.dtype)
         noised_latents = noise_scale * noise + (1 - noise_scale) * latents
@@ -653,7 +653,7 @@ class LTX2ConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTX2LoraLoad
     @staticmethod
     # Copied from diffusers.pipelines.ltx2.pipeline_ltx2.LTX2Pipeline._pack_audio_latents
     def _pack_audio_latents(
-        latents: torch.Tensor, patch_size: Optional[int] = None, patch_size_t: Optional[int] = None
+        latents: torch.Tensor, patch_size: int | None = None, patch_size_t: int | None = None
     ) -> torch.Tensor:
         # Audio latents shape: [B, C, L, M], where L is the latent audio length and M is the number of mel bins
         if patch_size is not None and patch_size_t is not None:
@@ -678,8 +678,8 @@ class LTX2ConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTX2LoraLoad
         latents: torch.Tensor,
         latent_length: int,
         num_mel_bins: int,
-        patch_size: Optional[int] = None,
-        patch_size_t: Optional[int] = None,
+        patch_size: int | None = None,
+        patch_size_t: int | None = None,
     ) -> torch.Tensor:
         # Unpacks an audio patch sequence of shape [B, S, D] into a latent spectrogram tensor of shape [B, C, L, M],
         # where L is the latent audio length and M is the number of mel bins.
@@ -948,10 +948,10 @@ class LTX2ConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTX2LoraLoad
         audio_latent_length: int = 1,  # 1 is just a dummy value
         num_mel_bins: int = 64,
         noise_scale: float = 0.0,
-        dtype: Optional[torch.dtype] = None,
-        device: Optional[torch.device] = None,
-        generator: Optional[torch.Generator] = None,
-        latents: Optional[torch.Tensor] = None,
+        dtype: torch.dtype | None = None,
+        device: torch.device | None = None,
+        generator: torch.Generator | None = None,
+        latents: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if latents is not None:
             if latents.ndim == 4:
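
Note on the diff above: the `X | None`, `str | torch.device`, and builtin-generic (`list[int]`, `tuple[...]`) annotations it introduces carry the same runtime information as the `Optional`/`Union`/`List`/`Tuple` forms they replace, but bare `X | Y` in evaluated annotations needs Python 3.10+ (or `from __future__ import annotations` in the module). The snippet below is a minimal sketch of that equivalence, independent of the pipeline code and not part of the PR; it uses only the standard library.

# Minimal sketch (assumes Python 3.10+; not part of the PR) showing that the new
# annotation style is runtime-equivalent to the forms it replaces.
from typing import List, Optional, Union, get_args, get_origin

# PEP 604 unions compare equal to their typing.Union / Optional counterparts.
assert int | None == Optional[int]
assert str | int | None == Union[str, int, None]

# Builtin generics such as `list[int]` are distinct objects from `List[int]`,
# but introspection sees the same origin and type arguments.
assert get_origin(list[int]) is get_origin(List[int]) is list
assert get_args(list[int]) == get_args(List[int]) == (int,)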