Merge branch 'main' into use-fixture-modular-tests

remove unneeded test.
use fixture for tmp_path in modular tests.
2026-03-02 14:50:38 +08:00 · 2026-02-27 15:33:17 +05:30 · 2026-02-27 15:30:23 +05:30 · 2026-02-27 15:29:41 +05:30 · 2026-02-27 15:02:17 +05:30 · 2026-02-27 14:59:58 +05:30
6 changed files with 115 additions and 44 deletions
--- a/src/diffusers/models/attention_dispatch.py
+++ b/src/diffusers/models/attention_dispatch.py
@@ -733,7 +733,7 @@ def _wrapped_flash_attn_3(
 ) -> tuple[torch.Tensor, torch.Tensor]:
    # Hardcoded for now because pytorch does not support tuple/int type hints
    window_size = (-1, -1)
-    out, lse, *_ = flash_attn_3_func(
+    result = flash_attn_3_func(
        q=q,
        k=k,
        v=v,
@@ -750,7 +750,9 @@ def _wrapped_flash_attn_3(
        pack_gqa=pack_gqa,
        deterministic=deterministic,
        sm_margin=sm_margin,
+        return_attn_probs=True,
    )
+    out, lse, *_ = result
    lse = lse.permute(0, 2, 1)
    return out, lse

@@ -2701,7 +2703,7 @@ def _flash_varlen_attention_3(
    key_packed = torch.cat(key_valid, dim=0)
    value_packed = torch.cat(value_valid, dim=0)

-    out, lse, *_ = flash_attn_3_varlen_func(
+    result = flash_attn_3_varlen_func(
        q=query_packed,
        k=key_packed,
        v=value_packed,
@@ -2711,7 +2713,13 @@ def _flash_varlen_attention_3(
        max_seqlen_k=max_seqlen_k,
        softmax_scale=scale,
        causal=is_causal,
+        return_attn_probs=return_lse,
    )
+    if isinstance(result, tuple):
+        out, lse, *_ = result
+    else:
+        out = result
+        lse = None
    out = out.unflatten(0, (batch_size, -1))

    return (out, lse) if return_lse else out
--- a/src/diffusers/pipelines/ltx2/pipeline_ltx2_image2video.py
+++ b/src/diffusers/pipelines/ltx2/pipeline_ltx2_image2video.py
@@ -699,9 +699,13 @@ class LTX2ImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTX2LoraL
        mask_shape = (batch_size, 1, num_frames, height, width)

        if latents is not None:
-            conditioning_mask = latents.new_zeros(mask_shape)
-            conditioning_mask[:, :, 0] = 1.0
            if latents.ndim == 5:
+                # conditioning_mask needs to be the same shape as latents in two-stage generation.
+                batch_size, _, num_frames, height, width = latents.shape
+                mask_shape = (batch_size, 1, num_frames, height, width)
+                conditioning_mask = latents.new_zeros(mask_shape)
+                conditioning_mask[:, :, 0] = 1.0
+
                latents = self._normalize_latents(
                    latents, self.vae.latents_mean, self.vae.latents_std, self.vae.config.scaling_factor
                )
@@ -710,6 +714,9 @@ class LTX2ImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTX2LoraL
                latents = self._pack_latents(
                    latents, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
                )
+            else:
+                conditioning_mask = latents.new_zeros(mask_shape)
+                conditioning_mask[:, :, 0] = 1.0
            conditioning_mask = self._pack_latents(
                conditioning_mask, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
            ).squeeze(-1)
--- a/tests/modular_pipelines/flux/test_modular_pipeline_flux.py
+++ b/tests/modular_pipelines/flux/test_modular_pipeline_flux.py
@@ -14,7 +14,6 @@
 # limitations under the License.

 import random
-import tempfile

 import numpy as np
 import PIL
@@ -129,18 +128,16 @@ class TestFluxImg2ImgModularPipelineFast(ModularPipelineTesterMixin):

        return inputs

-    def test_save_from_pretrained(self):
+    def test_save_from_pretrained(self, tmp_path):
        pipes = []
        base_pipe = self.get_pipeline().to(torch_device)
        pipes.append(base_pipe)

-        with tempfile.TemporaryDirectory() as tmpdirname:
-            base_pipe.save_pretrained(tmpdirname)
-
-            pipe = ModularPipeline.from_pretrained(tmpdirname).to(torch_device)
-            pipe.load_components(torch_dtype=torch.float32)
-            pipe.to(torch_device)
-            pipe.image_processor = VaeImageProcessor(vae_scale_factor=2)
+        base_pipe.save_pretrained(tmp_path)
+        pipe = ModularPipeline.from_pretrained(tmp_path).to(torch_device)
+        pipe.load_components(torch_dtype=torch.float32)
+        pipe.to(torch_device)
+        pipe.image_processor = VaeImageProcessor(vae_scale_factor=2)

        pipes.append(pipe)

@@ -212,18 +209,16 @@ class TestFluxKontextModularPipelineFast(ModularPipelineTesterMixin):

        return inputs

-    def test_save_from_pretrained(self):
+    def test_save_from_pretrained(self, tmp_path):
        pipes = []
        base_pipe = self.get_pipeline().to(torch_device)
        pipes.append(base_pipe)

-        with tempfile.TemporaryDirectory() as tmpdirname:
-            base_pipe.save_pretrained(tmpdirname)
-
-            pipe = ModularPipeline.from_pretrained(tmpdirname).to(torch_device)
-            pipe.load_components(torch_dtype=torch.float32)
-            pipe.to(torch_device)
-            pipe.image_processor = VaeImageProcessor(vae_scale_factor=2)
+        base_pipe.save_pretrained(tmp_path)
+        pipe = ModularPipeline.from_pretrained(tmp_path).to(torch_device)
+        pipe.load_components(torch_dtype=torch.float32)
+        pipe.to(torch_device)
+        pipe.image_processor = VaeImageProcessor(vae_scale_factor=2)

        pipes.append(pipe)

--- a/tests/modular_pipelines/test_modular_pipelines_common.py
+++ b/tests/modular_pipelines/test_modular_pipelines_common.py
@@ -1,5 +1,4 @@
 import gc
-import tempfile
 from typing import Callable

 import pytest
@@ -328,16 +327,15 @@ class ModularPipelineTesterMixin:

        assert torch.abs(image_slices[0] - image_slices[1]).max() < 1e-3

-    def test_save_from_pretrained(self):
+    def test_save_from_pretrained(self, tmp_path):
        pipes = []
        base_pipe = self.get_pipeline().to(torch_device)
        pipes.append(base_pipe)

-        with tempfile.TemporaryDirectory() as tmpdirname:
-            base_pipe.save_pretrained(tmpdirname)
-            pipe = ModularPipeline.from_pretrained(tmpdirname).to(torch_device)
-            pipe.load_components(torch_dtype=torch.float32)
-            pipe.to(torch_device)
+        base_pipe.save_pretrained(tmp_path)
+        pipe = ModularPipeline.from_pretrained(tmp_path).to(torch_device)
+        pipe.load_components(torch_dtype=torch.float32)
+        pipe.to(torch_device)

        pipes.append(pipe)

--- a/tests/modular_pipelines/test_modular_pipelines_custom_blocks.py
+++ b/tests/modular_pipelines/test_modular_pipelines_custom_blocks.py
@@ -14,7 +14,6 @@

 import json
 import os
-import tempfile
 from collections import deque
 from typing import List

@@ -153,25 +152,24 @@ class TestModularCustomBlocks:
        output_prompt = output.values["output_prompt"]
        assert output_prompt.startswith("Modular diffusers + ")

-    def test_custom_block_saving_loading(self):
+    def test_custom_block_saving_loading(self, tmp_path):
        custom_block = DummyCustomBlockSimple()

-        with tempfile.TemporaryDirectory() as tmpdir:
-            custom_block.save_pretrained(tmpdir)
-            assert any("modular_config.json" in k for k in os.listdir(tmpdir))
+        custom_block.save_pretrained(tmp_path)
+        assert any("modular_config.json" in k for k in os.listdir(tmp_path))

-            with open(os.path.join(tmpdir, "modular_config.json"), "r") as f:
-                config = json.load(f)
-            auto_map = config["auto_map"]
-            assert auto_map == {"ModularPipelineBlocks": "test_modular_pipelines_custom_blocks.DummyCustomBlockSimple"}
+        with open(os.path.join(tmp_path, "modular_config.json"), "r") as f:
+            config = json.load(f)
+        auto_map = config["auto_map"]
+        assert auto_map == {"ModularPipelineBlocks": "test_modular_pipelines_custom_blocks.DummyCustomBlockSimple"}

-            # For now, the Python script that implements the custom block has to be manually pushed to the Hub.
-            # This is why, we have to separately save the Python script here.
-            code_path = os.path.join(tmpdir, "test_modular_pipelines_custom_blocks.py")
-            with open(code_path, "w") as f:
-                f.write(CODE_STR)
+        # For now, the Python script that implements the custom block has to be manually pushed to the Hub.
+        # This is why we have to separately save the Python script here.
+        code_path = os.path.join(tmp_path, "test_modular_pipelines_custom_blocks.py")
+        with open(code_path, "w") as f:
+            f.write(CODE_STR)

-            loaded_custom_block = ModularPipelineBlocks.from_pretrained(tmpdir, trust_remote_code=True)
+        loaded_custom_block = ModularPipelineBlocks.from_pretrained(tmp_path, trust_remote_code=True)

        pipe = loaded_custom_block.init_pipeline()
        prompt = "Diffusers is nice"
--- a/tests/pipelines/ltx2/test_ltx2_image2video.py
+++ b/tests/pipelines/ltx2/test_ltx2_image2video.py
@@ -24,7 +24,8 @@ from diffusers import (
    LTX2ImageToVideoPipeline,
    LTX2VideoTransformer3DModel,
 )
-from diffusers.pipelines.ltx2 import LTX2TextConnectors
+from diffusers.pipelines.ltx2 import LTX2LatentUpsamplePipeline, LTX2TextConnectors
+from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
 from diffusers.pipelines.ltx2.vocoder import LTX2Vocoder

 from ...testing_utils import enable_full_determinism
@@ -174,6 +175,15 @@ class LTX2ImageToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestCase):

        return components

+    def get_dummy_upsample_component(self, in_channels=4, mid_channels=32, num_blocks_per_stage=1):
+        upsampler = LTX2LatentUpsamplerModel(
+            in_channels=in_channels,
+            mid_channels=mid_channels,
+            num_blocks_per_stage=num_blocks_per_stage,
+        )
+
+        return upsampler
+
    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
@@ -287,5 +297,60 @@ class LTX2ImageToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        assert torch.allclose(expected_video_slice, generated_video_slice, atol=1e-4, rtol=1e-4)
        assert torch.allclose(expected_audio_slice, generated_audio_slice, atol=1e-4, rtol=1e-4)

+    def test_two_stages_inference_with_upsampler(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        inputs["output_type"] = "latent"
+        first_stage_output = pipe(**inputs)
+        video_latent = first_stage_output.frames
+        audio_latent = first_stage_output.audio
+
+        self.assertEqual(video_latent.shape, (1, 4, 3, 16, 16))
+        self.assertEqual(audio_latent.shape, (1, 2, 5, 2))
+        self.assertEqual(audio_latent.shape[1], components["vocoder"].config.out_channels)
+
+        upsampler = self.get_dummy_upsample_component(in_channels=video_latent.shape[1])
+        upsample_pipe = LTX2LatentUpsamplePipeline(vae=pipe.vae, latent_upsampler=upsampler)
+        upscaled_video_latent = upsample_pipe(latents=video_latent, output_type="latent", return_dict=False)[0]
+        self.assertEqual(upscaled_video_latent.shape, (1, 4, 3, 32, 32))
+
+        inputs["latents"] = upscaled_video_latent
+        inputs["audio_latents"] = audio_latent
+        inputs["output_type"] = "pt"
+        second_stage_output = pipe(**inputs)
+        video = second_stage_output.frames
+        audio = second_stage_output.audio
+
+        self.assertEqual(video.shape, (1, 5, 3, 64, 64))
+        self.assertEqual(audio.shape[0], 1)
+        self.assertEqual(audio.shape[1], components["vocoder"].config.out_channels)
+
+        # fmt: off
+        expected_video_slice = torch.tensor(
+            [
+                0.4497, 0.6757, 0.4219, 0.7686, 0.4525, 0.6483, 0.3969, 0.7404, 0.3541, 0.3039, 0.4592, 0.3521, 0.3665, 0.2785, 0.3336, 0.3079
+            ]
+        )
+        expected_audio_slice = torch.tensor(
+            [
+                0.0271, 0.0492, 0.1249, 0.1126, 0.1661, 0.1060, 0.1717, 0.0944, 0.0672, -0.0069, 0.0688, 0.0097, 0.0808, 0.1231, 0.0986, 0.0739
+            ]
+        )
+        # fmt: on
+
+        video = video.flatten()
+        audio = audio.flatten()
+        generated_video_slice = torch.cat([video[:8], video[-8:]])
+        generated_audio_slice = torch.cat([audio[:8], audio[-8:]])
+
+        assert torch.allclose(expected_video_slice, generated_video_slice, atol=1e-4, rtol=1e-4)
+        assert torch.allclose(expected_audio_slice, generated_audio_slice, atol=1e-4, rtol=1e-4)
+
    def test_inference_batch_single_identical(self):
        self._test_inference_batch_single_identical(batch_size=2, expected_max_diff=2e-2)
Author	SHA1	Message	Date
Sayak Paul	49f02e3791	Merge branch 'main' into use-fixture-modular-tests	2026-02-27 15:33:17 +05:30
sayakpaul	de5878117f	remove unneeded test.	2026-02-27 15:30:23 +05:30
sayakpaul	dc9190545e	use fixture for tmp_path in modular tests.	2026-02-27 15:29:41 +05:30
sayakpaul	94457fd6b1	check for compulsory keys.	2026-02-27 15:02:17 +05:30
sayakpaul	6ebd990336	add a test to check modular index consistency	2026-02-27 14:59:58 +05:30
Jerry Song	40e96454f1	Fix LTX-2 image-to-video generation failure in two stages generation (#13187) * Fix LTX-2 image-to-video generation failure in two stages generation In LTX-2's two-stage image-to-video generation task, specifically after the upsampling step, a shape mismatch occurs between the `latents` and the `conditioning_mask`, which causes an error in function `_create_noised_state`. Fix it by creating the `conditioning_mask` based on the shape of the `latents`. * Add unit test for LTX-2 i2v two stages inference with upsampler * Downscaling the upsampler in LTX-2 image-to-video unit test * Apply style fixes --------- Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>	2026-02-27 00:55:01 -08:00
Varun Chawla	47455bd133	Fix Flash Attention 3 interface for new FA3 return format (#13173) * Fix Flash Attention 3 interface compatibility for new FA3 versions Newer versions of flash-attn (after Dao-AILab/flash-attention@ed20940) no longer return lse by default from flash_attn_3_func. The function now returns just the output tensor unless return_attn_probs=True is passed. Updated _wrapped_flash_attn_3 and _flash_varlen_attention_3 to pass return_attn_probs and handle both old (always tuple) and new (tensor or tuple) return formats gracefully. Fixes #12022 * Simplify _wrapped_flash_attn_3 return unpacking Since return_attn_probs=True is always passed, the result is guaranteed to be a tuple. Remove the unnecessary isinstance guard.	2026-02-26 17:34:36 +05:30