update

2025-12-17 01:44:43 +08:00 · 2025-12-10 11:29:05 +05:30 · 2025-12-10 11:18:10 +05:30 · 2025-12-10 11:01:21 +05:30 · 2025-12-10 09:16:39 +05:30 · 2025-12-09 14:06:09 +05:30
15 changed files with 1882 additions and 2 deletions
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -399,6 +399,8 @@ except OptionalDependencyNotAvailable:
 else:
    _import_structure["modular_pipelines"].extend(
        [
            "Flux2AutoBlocks",
            "Flux2ModularPipeline",
            "FluxAutoBlocks",
            "FluxKontextAutoBlocks",
            "FluxKontextModularPipeline",
@@ -1091,6 +1093,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        from .utils.dummy_torch_and_transformers_objects import *  # noqa F403
    else:
        from .modular_pipelines import (
            Flux2AutoBlocks,
            Flux2ModularPipeline,
            FluxAutoBlocks,
            FluxKontextAutoBlocks,
            FluxKontextModularPipeline,
--- a/src/diffusers/modular_pipelines/__init__.py
+++ b/src/diffusers/modular_pipelines/__init__.py
@@ -52,6 +52,10 @@ else:
        "FluxKontextAutoBlocks",
        "FluxKontextModularPipeline",
    ]
    _import_structure["flux2"] = [
        "Flux2AutoBlocks",
        "Flux2ModularPipeline",
    ]
    _import_structure["qwenimage"] = [
        "QwenImageAutoBlocks",
        "QwenImageModularPipeline",
@@ -71,6 +75,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    else:
        from .components_manager import ComponentsManager
        from .flux import FluxAutoBlocks, FluxKontextAutoBlocks, FluxKontextModularPipeline, FluxModularPipeline
        from .flux2 import Flux2AutoBlocks, Flux2ModularPipeline
        from .modular_pipeline import (
            AutoPipelineBlocks,
            BlockState,
--- a/src/diffusers/modular_pipelines/flux2/__init__.py
+++ b/src/diffusers/modular_pipelines/flux2/__init__.py
@@ -0,0 +1,111 @@
 from typing import TYPE_CHECKING
 from ...utils import (
    DIFFUSERS_SLOW_IMPORT,
    OptionalDependencyNotAvailable,
    _LazyModule,
    get_objects_from_module,
    is_torch_available,
    is_transformers_available,
 )
 # Objects collected from the dummy module when torch/transformers are missing; they are attached
 # to the replacement module at the bottom of this file.
 _dummy_objects = {}
 # Maps each submodule name to the public names it provides; handed to `_LazyModule` below.
 _import_structure = {}
 # Both torch and transformers are required. The availability check is expressed as
 # raise/except so the "missing" and "available" paths are handled by the same try/except/else.
 try:
    if not (is_transformers_available() and is_torch_available()):
        raise OptionalDependencyNotAvailable()
 except OptionalDependencyNotAvailable:
    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
    # Dependencies are available: register the public names of every flux2 submodule.
    _import_structure["encoders"] = [
        "Flux2TextEncoderStep",
        "Flux2RemoteTextEncoderStep",
        "Flux2VaeEncoderStep",
    ]
    _import_structure["before_denoise"] = [
        "Flux2SetTimestepsStep",
        "Flux2PrepareLatentsStep",
        "Flux2RoPEInputsStep",
        "Flux2PrepareImageLatentsStep",
    ]
    _import_structure["denoise"] = [
        "Flux2LoopDenoiser",
        "Flux2LoopAfterDenoiser",
        "Flux2DenoiseLoopWrapper",
        "Flux2DenoiseStep",
    ]
    _import_structure["decoders"] = ["Flux2DecodeStep"]
    _import_structure["inputs"] = [
        "Flux2ProcessImagesInputStep",
        "Flux2TextInputStep",
    ]
    _import_structure["modular_blocks"] = [
        "ALL_BLOCKS",
        "AUTO_BLOCKS",
        "REMOTE_AUTO_BLOCKS",
        "TEXT2IMAGE_BLOCKS",
        "IMAGE_CONDITIONED_BLOCKS",
        "Flux2AutoBlocks",
        "Flux2AutoVaeEncoderStep",
        "Flux2BeforeDenoiseStep",
        "Flux2VaeEncoderSequentialStep",
    ]
    _import_structure["modular_pipeline"] = ["Flux2ModularPipeline"]
 # Under a type checker, or when DIFFUSERS_SLOW_IMPORT is set, import every name eagerly.
 # The imports below mirror the entries registered in `_import_structure` above.
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
        if not (is_transformers_available() and is_torch_available()):
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
    else:
        from .before_denoise import (
            Flux2PrepareImageLatentsStep,
            Flux2PrepareLatentsStep,
            Flux2RoPEInputsStep,
            Flux2SetTimestepsStep,
        )
        from .decoders import Flux2DecodeStep
        from .denoise import (
            Flux2DenoiseLoopWrapper,
            Flux2DenoiseStep,
            Flux2LoopAfterDenoiser,
            Flux2LoopDenoiser,
        )
        from .encoders import (
            Flux2RemoteTextEncoderStep,
            Flux2TextEncoderStep,
            Flux2VaeEncoderStep,
        )
        from .inputs import (
            Flux2ProcessImagesInputStep,
            Flux2TextInputStep,
        )
        from .modular_blocks import (
            ALL_BLOCKS,
            AUTO_BLOCKS,
            IMAGE_CONDITIONED_BLOCKS,
            REMOTE_AUTO_BLOCKS,
            TEXT2IMAGE_BLOCKS,
            Flux2AutoBlocks,
            Flux2AutoVaeEncoderStep,
            Flux2BeforeDenoiseStep,
            Flux2VaeEncoderSequentialStep,
        )
        from .modular_pipeline import Flux2ModularPipeline
 else:
    import sys
    # Replace this module's entry in `sys.modules` with a `_LazyModule` built from
    # `_import_structure`.
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )
    # Expose the collected dummy objects (empty when the dependencies are installed) as
    # attributes of the replacement module.
    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)
--- a/src/diffusers/modular_pipelines/flux2/before_denoise.py
+++ b/src/diffusers/modular_pipelines/flux2/before_denoise.py
@@ -0,0 +1,508 @@
 # Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import inspect
 from typing import List, Optional, Union
 import numpy as np
 import torch
 from ...models import Flux2Transformer2DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
 from ...utils import logging
 from ...utils.torch_utils import randn_tensor
 from ..modular_pipeline import ModularPipelineBlocks, PipelineState
 from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
 from .modular_pipeline import Flux2ModularPipeline
 # Module-level logger, named after this module through `__name__`.
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 def compute_empirical_mu(image_seq_len: int, num_steps: int) -> float:
    """Return the empirical `mu` value used for Flux2 timestep scheduling.

    Two linear fits in `image_seq_len` are evaluated. For sequences longer than 4300 tokens the
    second fit is returned directly and `num_steps` is ignored. Otherwise the result lies on the
    straight line (in `num_steps`) that passes through the first fit at 10 steps and through the
    second fit at 200 steps.

    Args:
        image_seq_len: Number of image tokens.
        num_steps: Number of inference steps.

    Returns:
        The value of `mu` as a Python float.
    """
    slope_10, offset_10 = 8.73809524e-05, 1.89833333
    slope_200, offset_200 = 0.00016927, 0.45666666

    mu_at_200 = slope_200 * image_seq_len + offset_200
    if image_seq_len > 4300:
        return float(mu_at_200)

    mu_at_10 = slope_10 * image_seq_len + offset_10
    # Line through (10, mu_at_10) and (200, mu_at_200), evaluated at `num_steps`.
    step_slope = (mu_at_200 - mu_at_10) / 190.0
    step_offset = mu_at_200 - 200.0 * step_slope
    return float(step_slope * num_steps + step_offset)
 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
 def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
 ):
    r"""
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.
    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    """
    # A custom `timesteps` schedule and a custom `sigmas` schedule are mutually exclusive.
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
    if timesteps is not None:
        # Inspect the signature of `set_timesteps` to check that it takes a `timesteps` argument.
        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        # The step count is taken from the schedule the scheduler ended up with.
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        # Same signature check as above, this time for a `sigmas` argument.
        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accept_sigmas:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        # No custom schedule: the scheduler builds one from `num_inference_steps`, which is
        # returned unchanged.
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps
 class Flux2SetTimestepsStep(ModularPipelineBlocks):
    """Pipeline block that sets up the scheduler's timestep schedule for Flux2.

    The block derives an image sequence length from the requested resolution, computes `mu` with
    `compute_empirical_mu`, forwards everything to `retrieve_timesteps` and finally builds a
    guidance tensor expanded to the effective batch size.
    """

    model_name = "flux2"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        """Components declared by this block: the scheduler and the transformer."""
        scheduler_spec = ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)
        transformer_spec = ComponentSpec("transformer", Flux2Transformer2DModel)
        return [scheduler_spec, transformer_spec]

    @property
    def description(self) -> str:
        """One-line summary of the block."""
        return "Step that sets the scheduler's timesteps for Flux2 inference using empirical mu calculation"

    @property
    def inputs(self) -> List[InputParam]:
        """Inputs declared by this block; only `batch_size` is required."""
        batch_size_description = (
            "Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`."
        )
        return [
            InputParam(name="num_inference_steps", default=50),
            InputParam(name="timesteps"),
            InputParam(name="sigmas"),
            InputParam(name="guidance_scale", default=4.0),
            InputParam(name="latents", type_hint=torch.Tensor),
            InputParam(name="num_images_per_prompt", default=1),
            InputParam(name="height", type_hint=int),
            InputParam(name="width", type_hint=int),
            InputParam(
                name="batch_size",
                type_hint=int,
                required=True,
                description=batch_size_description,
            ),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        """Values written to the pipeline state: `timesteps`, `num_inference_steps` and `guidance`."""
        return [
            OutputParam(name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for inference"),
            OutputParam(
                name="num_inference_steps",
                type_hint=int,
                description="The number of denoising steps to perform at inference time",
            ),
            OutputParam(name="guidance", type_hint=torch.Tensor, description="Guidance scale tensor"),
        ]

    @torch.no_grad()
    def __call__(self, components: Flux2ModularPipeline, state: PipelineState) -> PipelineState:
        """Configure the scheduler and store the schedule and the guidance tensor in the block state.

        Returns the `(components, state)` pair.
        """
        block_state = self.get_block_state(state)
        block_state.device = components._execution_device
        scheduler = components.scheduler

        # A missing (or otherwise falsy) resolution falls back to the pipeline defaults.
        height = block_state.height or components.default_height
        width = block_state.width or components.default_width

        # The image sequence length is the product of the halved latent height and width.
        patch_stride = components.vae_scale_factor * 2
        token_rows = (2 * (int(height) // patch_stride)) // 2
        token_cols = (2 * (int(width) // patch_stride)) // 2
        image_seq_len = token_rows * token_cols

        num_inference_steps = block_state.num_inference_steps
        timesteps = block_state.timesteps
        sigmas = block_state.sigmas
        if timesteps is None and sigmas is None:
            # No custom schedule given: evenly spaced sigmas from 1.0 down to 1 / num_inference_steps.
            sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
        if getattr(scheduler.config, "use_flow_sigmas", False):
            # Sigmas are dropped when the scheduler config enables `use_flow_sigmas`.
            sigmas = None

        timesteps, num_inference_steps = retrieve_timesteps(
            scheduler,
            num_inference_steps,
            block_state.device,
            timesteps=timesteps,
            sigmas=sigmas,
            mu=compute_empirical_mu(image_seq_len=image_seq_len, num_steps=num_inference_steps),
        )
        block_state.timesteps = timesteps
        block_state.num_inference_steps = num_inference_steps

        # One guidance value, expanded to one entry per generated sample.
        effective_batch_size = block_state.batch_size * block_state.num_images_per_prompt
        block_state.guidance = torch.full(
            [1], block_state.guidance_scale, device=block_state.device, dtype=torch.float32
        ).expand(effective_batch_size)

        scheduler.set_begin_index(0)
        self.set_block_state(state, block_state)
        return components, state
 class Flux2PrepareLatentsStep(ModularPipelineBlocks):
    """Pipeline block that creates (or adopts) the initial latents for Flux2 text-to-image.

    The latents are produced in (B, C, H, W) layout, position IDs are generated for them, and the
    latents are then packed to (B, H * W, C) before being written to the block state.
    """

    model_name = "flux2"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        """This block declares no components."""
        return []

    @property
    def description(self) -> str:
        """One-line summary of the block."""
        return "Prepare latents step that prepares the initial noise latents for Flux2 text-to-image generation"

    @property
    def inputs(self) -> List[InputParam]:
        """Inputs declared by this block; only `batch_size` is required."""
        batch_size_description = (
            "Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`."
        )
        return [
            InputParam(name="height", type_hint=int),
            InputParam(name="width", type_hint=int),
            InputParam(name="latents", type_hint=Optional[torch.Tensor]),
            InputParam(name="num_images_per_prompt", type_hint=int, default=1),
            InputParam(name="generator"),
            InputParam(
                name="batch_size",
                type_hint=int,
                required=True,
                description=batch_size_description,
            ),
            InputParam(name="dtype", type_hint=torch.dtype, description="The dtype of the model inputs"),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        """Values written to the pipeline state: the packed `latents` and their `latent_ids`."""
        return [
            OutputParam(
                name="latents",
                type_hint=torch.Tensor,
                description="The initial latents to use for the denoising process",
            ),
            OutputParam(
                name="latent_ids", type_hint=torch.Tensor, description="Position IDs for the latents (for RoPE)"
            ),
        ]

    @staticmethod
    def check_inputs(components, block_state):
        """Log a warning when a given height or width is not a multiple of `vae_scale_factor * 2`.

        Dimensions that are `None` are not checked. Nothing is raised.
        """
        vae_scale_factor = components.vae_scale_factor
        misaligned = any(
            size is not None and size % (vae_scale_factor * 2) != 0
            for size in (block_state.height, block_state.width)
        )
        if misaligned:
            logger.warning(
                f"`height` and `width` have to be divisible by {vae_scale_factor * 2} but are {block_state.height} and {block_state.width}."
            )

    @staticmethod
    def _prepare_latent_ids(latents: torch.Tensor):
        """
        Build 4D position coordinates (T, H, W, L) for a latent tensor.
        Args:
            latents: Latent tensor of shape (B, C, H, W)
        Returns:
            Position IDs tensor of shape (B, H*W, 4)
        """
        batch_size, _, height, width = latents.shape
        # The T and L axes have a single coordinate (0); H and W enumerate the latent grid.
        singleton = torch.arange(1)
        grid = torch.cartesian_prod(singleton, torch.arange(height), torch.arange(width), singleton)
        return grid.unsqueeze(0).expand(batch_size, -1, -1)

    @staticmethod
    def _pack_latents(latents):
        """Pack latents: (batch_size, num_channels, height, width) -> (batch_size, height * width, num_channels)"""
        batch_size, num_channels, height, width = latents.shape
        num_tokens = height * width
        flattened = latents.reshape(batch_size, num_channels, num_tokens)
        return flattened.permute(0, 2, 1)

    @staticmethod
    def prepare_latents(
        comp,
        batch_size,
        num_channels_latents,
        height,
        width,
        dtype,
        device,
        generator,
        latents=None,
    ):
        """Return the starting latents, sampling noise when none are supplied.

        Args:
            comp: Object providing `vae_scale_factor`.
            batch_size: Effective batch size of the latents.
            num_channels_latents: Channel count; the sampled tensor has four times as many channels.
            height: Image height, converted with `int()`.
            width: Image width, converted with `int()`.
            dtype: Target dtype.
            device: Target device.
            generator: A generator or a list of generators passed to `randn_tensor`.
            latents: Optional existing latents; when given they are only moved to `device`/`dtype`.

        Raises:
            ValueError: If `generator` is a list whose length differs from `batch_size`.
        """
        latent_height = 2 * (int(height) // (comp.vae_scale_factor * 2))
        latent_width = 2 * (int(width) // (comp.vae_scale_factor * 2))
        noise_shape = (batch_size, num_channels_latents * 4, latent_height // 2, latent_width // 2)

        generator_count_mismatch = isinstance(generator, list) and len(generator) != batch_size
        if generator_count_mismatch:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if latents is not None:
            return latents.to(device=device, dtype=dtype)
        return randn_tensor(noise_shape, generator=generator, device=device, dtype=dtype)

    @torch.no_grad()
    def __call__(self, components: Flux2ModularPipeline, state: PipelineState) -> PipelineState:
        """Prepare the latents and their position IDs and store both in the block state.

        Returns the `(components, state)` pair.
        """
        block_state = self.get_block_state(state)

        # A missing (or otherwise falsy) resolution falls back to the pipeline defaults.
        block_state.height = block_state.height or components.default_height
        block_state.width = block_state.width or components.default_width
        block_state.device = components._execution_device
        block_state.num_channels_latents = components.num_channels_latents
        self.check_inputs(components, block_state)

        unpacked_latents = self.prepare_latents(
            comp=components,
            batch_size=block_state.batch_size * block_state.num_images_per_prompt,
            num_channels_latents=block_state.num_channels_latents,
            height=block_state.height,
            width=block_state.width,
            dtype=block_state.dtype,
            device=block_state.device,
            generator=block_state.generator,
            latents=block_state.latents,
        )

        # The position IDs are derived from the (B, C, H, W) layout, so build them before packing.
        position_ids = self._prepare_latent_ids(unpacked_latents).to(block_state.device)
        block_state.latents = self._pack_latents(unpacked_latents)
        block_state.latent_ids = position_ids

        self.set_block_state(state, block_state)
        return components, state
 class Flux2RoPEInputsStep(ModularPipelineBlocks):
    """Pipeline block that builds the 4D position IDs of the text tokens for Flux2.

    `txt_ids` is computed from the shape of `prompt_embeds`; `latent_ids` is declared as both an
    input and an output and is not modified here.
    """

    model_name = "flux2"

    @property
    def description(self) -> str:
        """One-line summary of the block."""
        return "Step that prepares the 4D RoPE position IDs for Flux2 denoising. Should be placed after text encoder and latent preparation steps."

    @property
    def inputs(self) -> List[InputParam]:
        """Inputs declared by this block; `prompt_embeds` is required."""
        return [
            InputParam("prompt_embeds", required=True),
            InputParam("latent_ids"),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        """Outputs declared by this block, both tagged as `denoiser_input_fields`."""
        return [
            OutputParam(
                "txt_ids",
                type_hint=torch.Tensor,
                kwargs_type="denoiser_input_fields",
                description="4D position IDs (T, H, W, L) for text tokens, used for RoPE calculation.",
            ),
            OutputParam(
                "latent_ids",
                type_hint=torch.Tensor,
                kwargs_type="denoiser_input_fields",
                description="4D position IDs (T, H, W, L) for image latents, used for RoPE calculation.",
            ),
        ]

    @staticmethod
    def _prepare_text_ids(x: torch.Tensor, t_coord: Optional[torch.Tensor] = None):
        """Build 4D position IDs (T, H, W, L) for text tokens.

        Args:
            x: Tensor whose first two dimensions are the batch size and the sequence length.
            t_coord: Optional per-sample T coordinates, indexed by batch position. When `None`,
                T is the single coordinate 0.

        Returns:
            The per-sample coordinate tables stacked along a new leading batch dimension.
        """
        batch_size, seq_len, _ = x.shape
        # H and W have a single coordinate (0); only the L axis enumerates the tokens.
        singleton = torch.arange(1)
        token_positions = torch.arange(seq_len)
        per_sample_ids = [
            torch.cartesian_prod(
                singleton if t_coord is None else t_coord[sample_idx],
                singleton,
                singleton,
                token_positions,
            )
            for sample_idx in range(batch_size)
        ]
        return torch.stack(per_sample_ids)

    def __call__(self, components: Flux2ModularPipeline, state: PipelineState) -> PipelineState:
        """Compute `txt_ids` on the device of `prompt_embeds` and store it in the block state.

        Returns the `(components, state)` pair.
        """
        block_state = self.get_block_state(state)
        text_embeddings = block_state.prompt_embeds
        block_state.txt_ids = self._prepare_text_ids(text_embeddings).to(text_embeddings.device)
        self.set_block_state(state, block_state)
        return components, state
 class Flux2PrepareImageLatentsStep(ModularPipelineBlocks):
    """Pipeline block that packs conditioning image latents and builds their position IDs.

    When no image latents are present, both outputs are set to `None`.
    """

    model_name = "flux2"

    @property
    def description(self) -> str:
        """One-line summary of the block."""
        return "Step that prepares image latents and their position IDs for Flux2 image conditioning."

    @property
    def inputs(self) -> List[InputParam]:
        """Inputs declared by this block; only `batch_size` is required."""
        return [
            InputParam(name="image_latents", type_hint=List[torch.Tensor]),
            InputParam(name="batch_size", required=True, type_hint=int),
            InputParam(name="num_images_per_prompt", default=1, type_hint=int),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        """Values written to the pipeline state: `image_latents` and `image_latent_ids`."""
        return [
            OutputParam(
                name="image_latents",
                type_hint=torch.Tensor,
                description="Packed image latents for conditioning",
            ),
            OutputParam(
                name="image_latent_ids",
                type_hint=torch.Tensor,
                description="Position IDs for image latents",
            ),
        ]

    @staticmethod
    def _prepare_image_ids(image_latents: List[torch.Tensor], scale: int = 10):
        """
        Build 4D time-space coordinates (T, H, W, L) for a sequence of image latents.
        Args:
            image_latents: A list of image latent feature tensors of shape (1, C, H, W).
            scale: Factor used to define the time separation between latents.
        Returns:
            Combined coordinate tensor of shape (1, N_total, 4)
        Raises:
            ValueError: If `image_latents` is not a list.
        """
        if not isinstance(image_latents, list):
            raise ValueError(f"Expected `image_latents` to be a list, got {type(image_latents)}.")

        per_image_ids = []
        for latent, position in zip(image_latents, torch.arange(0, len(image_latents))):
            # The i-th image gets the T coordinate `scale + scale * i`, as a one-element tensor.
            time_coord = (scale + scale * position).view(-1)
            _, height, width = latent.squeeze(0).shape
            per_image_ids.append(
                torch.cartesian_prod(time_coord, torch.arange(height), torch.arange(width), torch.arange(1))
            )
        return torch.cat(per_image_ids, dim=0).unsqueeze(0)

    @staticmethod
    def _pack_latents(latents):
        """Pack latents: (batch_size, num_channels, height, width) -> (batch_size, height * width, num_channels)"""
        batch_size, num_channels, height, width = latents.shape
        num_tokens = height * width
        flattened = latents.reshape(batch_size, num_channels, num_tokens)
        return flattened.permute(0, 2, 1)

    @torch.no_grad()
    def __call__(self, components: Flux2ModularPipeline, state: PipelineState) -> PipelineState:
        """Pack the image latents into one sequence, build their IDs and repeat both per sample.

        Returns the `(components, state)` pair.
        """
        block_state = self.get_block_state(state)
        source_latents = block_state.image_latents

        if source_latents is None:
            # Nothing to condition on: publish `None` for both outputs.
            block_state.image_latents = None
            block_state.image_latent_ids = None
            self.set_block_state(state, block_state)
            return components, state

        device = components._execution_device
        effective_batch_size = block_state.batch_size * block_state.num_images_per_prompt
        position_ids = self._prepare_image_ids(source_latents)

        # Pack every image separately, then join all tokens into one sequence with a batch dim of 1.
        token_chunks = [self._pack_latents(latent).squeeze(0) for latent in source_latents]
        packed_sequence = torch.cat(token_chunks, dim=0).unsqueeze(0)

        block_state.image_latents = packed_sequence.repeat(effective_batch_size, 1, 1)
        block_state.image_latent_ids = position_ids.repeat(effective_batch_size, 1, 1).to(device)

        self.set_block_state(state, block_state)
        return components, state
--- a/src/diffusers/modular_pipelines/flux2/decoders.py
+++ b/src/diffusers/modular_pipelines/flux2/decoders.py
@@ -0,0 +1,146 @@
 # Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import Any, List, Tuple, Union
 import numpy as np
 import PIL
 import torch
 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKLFlux2
 from ...pipelines.flux2.image_processor import Flux2ImageProcessor
 from ...utils import logging
 from ..modular_pipeline import ModularPipelineBlocks, PipelineState
 from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
 # Module-level logger, named after this module through `__name__`.
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 class Flux2DecodeStep(ModularPipelineBlocks):
    """Pipeline block that turns denoised, packed Flux2 latents into images.

    Unless `output_type` is `"latent"`, the latents are unpacked with their position IDs,
    de-normalized with the VAE's batch-norm running statistics, un-patchified, decoded by the VAE
    and post-processed by the image processor.
    """

    model_name = "flux2"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        """Components declared by this block: the VAE and a config-created image processor."""
        vae_spec = ComponentSpec("vae", AutoencoderKLFlux2)
        image_processor_spec = ComponentSpec(
            "image_processor",
            Flux2ImageProcessor,
            config=FrozenDict({"vae_scale_factor": 16, "vae_latent_channels": 32}),
            default_creation_method="from_config",
        )
        return [vae_spec, image_processor_spec]

    @property
    def description(self) -> str:
        """One-line summary of the block."""
        return "Step that decodes the denoised latents into images using Flux2 VAE with batch norm denormalization"

    # NOTE(review): the return annotations of `inputs` and `intermediate_outputs` do not match
    # the `InputParam` / `OutputParam` objects that are actually returned.
    @property
    def inputs(self) -> List[Tuple[str, Any]]:
        """Inputs declared by this block; `latents` and `latent_ids` are required."""
        return [
            InputParam(name="output_type", default="pil"),
            InputParam(
                name="latents",
                type_hint=torch.Tensor,
                required=True,
                description="The denoised latents from the denoising step",
            ),
            InputParam(
                name="latent_ids",
                type_hint=torch.Tensor,
                required=True,
                description="Position IDs for the latents, used for unpacking",
            ),
        ]

    @property
    def intermediate_outputs(self) -> List[str]:
        """Value written to the pipeline state: `images`."""
        return [
            OutputParam(
                name="images",
                type_hint=Union[List[PIL.Image.Image], torch.Tensor, np.ndarray],
                description="The generated images, can be a list of PIL.Image.Image, torch.Tensor or a numpy array",
            )
        ]

    @staticmethod
    def _unpack_latents_with_ids(x: torch.Tensor, x_ids: torch.Tensor) -> torch.Tensor:
        """
        Scatter packed tokens back onto their 2D grid using the position IDs.
        Args:
            x: Packed latents tensor of shape (B, seq_len, C)
            x_ids: Position IDs tensor of shape (B, seq_len, 4) with (T, H, W, L) coordinates
        Returns:
            Unpacked latents tensor of shape (B, C, H, W)
        """
        unpacked = []
        for tokens, coords in zip(x, x_ids):
            _, channels = tokens.shape
            rows = coords[:, 1].to(torch.int64)
            cols = coords[:, 2].to(torch.int64)
            # The grid size is inferred from the largest row / column index present.
            grid_h = rows.max() + 1
            grid_w = cols.max() + 1

            # Row-major flat index of every token, broadcast across the channel dimension.
            scatter_index = (rows * grid_w + cols).unsqueeze(1).expand(-1, channels)
            canvas = torch.zeros((grid_h * grid_w, channels), device=tokens.device, dtype=tokens.dtype)
            canvas.scatter_(0, scatter_index, tokens)
            unpacked.append(canvas.view(grid_h, grid_w, channels).permute(2, 0, 1))
        return torch.stack(unpacked, dim=0)

    @staticmethod
    def _unpatchify_latents(latents):
        """Undo 2x2 patchification: (B, C, H, W) -> (B, C // 4, H * 2, W * 2)."""
        batch_size, num_channels_latents, height, width = latents.shape
        base_channels = num_channels_latents // (2 * 2)
        patches = latents.reshape(batch_size, base_channels, 2, 2, height, width)
        # Interleave each 2x2 patch with its spatial position: (B, C, H, 2, W, 2).
        interleaved = patches.permute(0, 1, 4, 2, 5, 3)
        return interleaved.reshape(batch_size, base_channels, height * 2, width * 2)

    @torch.no_grad()
    def __call__(self, components, state: PipelineState) -> PipelineState:
        """Decode the latents (or pass them through for `output_type == "latent"`).

        Returns the `(components, state)` pair.
        """
        block_state = self.get_block_state(state)
        vae = components.vae

        if block_state.output_type == "latent":
            # Latent output requested: hand the packed latents back untouched.
            block_state.images = block_state.latents
            self.set_block_state(state, block_state)
            return components, state

        spatial_latents = self._unpack_latents_with_ids(block_state.latents, block_state.latent_ids)

        # De-normalize with the VAE batch-norm running statistics: x * std + mean.
        bn_mean = vae.bn.running_mean.view(1, -1, 1, 1).to(spatial_latents.device, spatial_latents.dtype)
        bn_std = torch.sqrt(vae.bn.running_var.view(1, -1, 1, 1) + vae.config.batch_norm_eps).to(
            spatial_latents.device, spatial_latents.dtype
        )
        spatial_latents = spatial_latents * bn_std + bn_mean
        spatial_latents = self._unpatchify_latents(spatial_latents)

        decoded = vae.decode(spatial_latents, return_dict=False)[0]
        block_state.images = components.image_processor.postprocess(decoded, output_type=block_state.output_type)

        self.set_block_state(state, block_state)
        return components, state
--- a/src/diffusers/modular_pipelines/flux2/denoise.py
+++ b/src/diffusers/modular_pipelines/flux2/denoise.py
@@ -0,0 +1,252 @@
 # Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import Any, List, Tuple
 import torch
 from ...models import Flux2Transformer2DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
 from ...utils import is_torch_xla_available, logging
 from ..modular_pipeline import (
    BlockState,
    LoopSequentialPipelineBlocks,
    ModularPipelineBlocks,
    PipelineState,
 )
 from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
 from .modular_pipeline import Flux2ModularPipeline
 if is_torch_xla_available():
    import torch_xla.core.xla_model as xm
    XLA_AVAILABLE = True
 else:
    XLA_AVAILABLE = False
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 class Flux2LoopDenoiser(ModularPipelineBlocks):
    """Loop sub-block that runs the Flux2 transformer once and stores the result in `block_state.noise_pred`."""

    model_name = "flux2"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        """Components required by this sub-block (only the transformer)."""
        return [ComponentSpec("transformer", Flux2Transformer2DModel)]

    @property
    def description(self) -> str:
        """Human-readable summary of what the sub-block does."""
        return (
            "Step within the denoising loop that denoises the latents for Flux2. "
            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
            "object (e.g. `Flux2DenoiseLoopWrapper`)"
        )

    @property
    def inputs(self) -> List[Tuple[str, Any]]:
        """Values read from the block state; `image_latents` and `image_latent_ids` are optional."""
        return [
            InputParam("joint_attention_kwargs"),
            InputParam(
                "latents",
                required=True,
                type_hint=torch.Tensor,
                description="The latents to denoise. Shape: (B, seq_len, C)",
            ),
            InputParam(
                "image_latents",
                type_hint=torch.Tensor,
                description="Packed image latents for conditioning. Shape: (B, img_seq_len, C)",
            ),
            InputParam(
                "image_latent_ids",
                type_hint=torch.Tensor,
                description="Position IDs for image latents. Shape: (B, img_seq_len, 4)",
            ),
            InputParam(
                "guidance",
                required=True,
                type_hint=torch.Tensor,
                description="Guidance scale as a tensor",
            ),
            InputParam(
                "prompt_embeds",
                required=True,
                type_hint=torch.Tensor,
                description="Text embeddings from Mistral3",
            ),
            InputParam(
                "txt_ids",
                required=True,
                type_hint=torch.Tensor,
                description="4D position IDs for text tokens (T, H, W, L)",
            ),
            InputParam(
                "latent_ids",
                required=True,
                type_hint=torch.Tensor,
                description="4D position IDs for latent tokens (T, H, W, L)",
            ),
        ]

    @torch.no_grad()
    def __call__(
        self, components: Flux2ModularPipeline, block_state: BlockState, i: int, t: torch.Tensor
    ) -> PipelineState:
        """Run one transformer forward pass for timestep `t`.

        When `image_latents` is present on the block state it is appended after `latents` along dim 1, and
        `image_latent_ids` is appended to `latent_ids` the same way. Only the first `latents.size(1)` positions of
        the transformer output are kept. `i` is not used here.

        Returns:
            The `(components, block_state)` pair, with `noise_pred` set on the block state.
        """
        latents = block_state.latents
        hidden_states = latents
        position_ids = block_state.latent_ids

        reference_latents = getattr(block_state, "image_latents", None)
        if reference_latents is not None:
            hidden_states = torch.cat([latents, reference_latents], dim=1)
            position_ids = torch.cat([position_ids, block_state.image_latent_ids], dim=1)
        hidden_states = hidden_states.to(components.transformer.dtype)

        # One timestep value per batch element; the transformer is fed `t / 1000`.
        timestep = t.expand(latents.shape[0]).to(latents.dtype)

        prediction = components.transformer(
            hidden_states=hidden_states,
            timestep=timestep / 1000,
            guidance=block_state.guidance,
            encoder_hidden_states=block_state.prompt_embeds,
            txt_ids=block_state.txt_ids,
            img_ids=position_ids,
            joint_attention_kwargs=block_state.joint_attention_kwargs,
            return_dict=False,
        )[0]

        # Drop the positions that belong to the appended conditioning tokens.
        block_state.noise_pred = prediction[:, : latents.size(1)]
        return components, block_state
 class Flux2LoopAfterDenoiser(ModularPipelineBlocks):
    """Loop sub-block that advances `latents` by one scheduler step using `block_state.noise_pred`."""

    model_name = "flux2"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        """Components required by this sub-block (only the scheduler)."""
        return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]

    @property
    def description(self) -> str:
        """Human-readable summary of what the sub-block does."""
        return (
            "Step within the denoising loop that updates the latents after denoising. "
            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
            "object (e.g. `Flux2DenoiseLoopWrapper`)"
        )

    @property
    def inputs(self) -> List[Tuple[str, Any]]:
        """This sub-block declares no user-facing inputs."""
        return []

    @property
    def intermediate_inputs(self) -> List[str]:
        """Intermediate values declared by this sub-block (`generator`)."""
        return [InputParam("generator")]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        """Values written back to the block state (`latents`)."""
        return [OutputParam("latents", type_hint=torch.Tensor, description="The denoised latents")]

    @torch.no_grad()
    def __call__(self, components: Flux2ModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
        """Replace `block_state.latents` with the scheduler's output for timestep `t`.

        `i` is not used here.

        Returns:
            The `(components, block_state)` pair.
        """
        dtype_before_step = block_state.latents.dtype
        step_output = components.scheduler.step(
            block_state.noise_pred,
            t,
            block_state.latents,
            return_dict=False,
        )
        block_state.latents = step_output[0]

        # The cast back is applied only when MPS is available.
        # NOTE(review): presumably a workaround for an MPS dtype issue - confirm.
        if block_state.latents.dtype != dtype_before_step and torch.backends.mps.is_available():
            block_state.latents = block_state.latents.to(dtype_before_step)

        return components, block_state
 class Flux2DenoiseLoopWrapper(LoopSequentialPipelineBlocks):
    """Loop block that calls `self.loop_step` once per entry of `timesteps` and drives the progress bar."""

    model_name = "flux2"

    @property
    def description(self) -> str:
        """Human-readable summary of what the loop does."""
        return (
            "Pipeline block that iteratively denoises the latents over `timesteps`. "
            "The specific steps within each iteration can be customized with `sub_blocks` attribute"
        )

    @property
    def loop_expected_components(self) -> List[ComponentSpec]:
        """Components the loop itself relies on."""
        return [
            ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler),
            ComponentSpec("transformer", Flux2Transformer2DModel),
        ]

    @property
    def loop_inputs(self) -> List[InputParam]:
        """Inputs the loop reads from the block state."""
        return [
            InputParam(
                "timesteps",
                required=True,
                type_hint=torch.Tensor,
                description="The timesteps to use for the denoising process.",
            ),
            InputParam(
                "num_inference_steps",
                required=True,
                type_hint=int,
                description="The number of inference steps to use for the denoising process.",
            ),
        ]

    @torch.no_grad()
    def __call__(self, components: Flux2ModularPipeline, state: PipelineState) -> PipelineState:
        """Iterate over `timesteps`, delegating each iteration to `self.loop_step`.

        Returns:
            The `(components, state)` pair after the final block state has been written back.
        """
        block_state = self.get_block_state(state)

        # Timesteps beyond `num_inference_steps * scheduler.order` count as warmup; never negative.
        surplus_steps = len(block_state.timesteps) - block_state.num_inference_steps * components.scheduler.order
        block_state.num_warmup_steps = max(surplus_steps, 0)

        with self.progress_bar(total=block_state.num_inference_steps) as progress_bar:
            for i, t in enumerate(block_state.timesteps):
                components, block_state = self.loop_step(components, block_state, i=i, t=t)

                completed = i + 1
                is_last = completed == len(block_state.timesteps)
                # Tick on the final timestep, or after warmup on every `scheduler.order`-th step.
                if is_last or (
                    completed > block_state.num_warmup_steps and completed % components.scheduler.order == 0
                ):
                    progress_bar.update()

                if XLA_AVAILABLE:
                    xm.mark_step()

        self.set_block_state(state, block_state)
        return components, state
 class Flux2DenoiseStep(Flux2DenoiseLoopWrapper):
    """Concrete Flux2 denoise loop: `Flux2LoopDenoiser` followed by `Flux2LoopAfterDenoiser` on every iteration."""
    # Sub-block classes and the names they are registered under, in execution order.
    block_classes = [Flux2LoopDenoiser, Flux2LoopAfterDenoiser]
    block_names = ["denoiser", "after_denoiser"]
    @property
    def description(self) -> str:
        """Human-readable summary listing the sub-blocks run per iteration."""
        return (
            "Denoise step that iteratively denoises the latents for Flux2. \n"
            "Its loop logic is defined in `Flux2DenoiseLoopWrapper.__call__` method \n"
            "At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
            " - `Flux2LoopDenoiser`\n"
            " - `Flux2LoopAfterDenoiser`\n"
            "This block supports both text-to-image and image-conditioned generation."
        )
--- a/src/diffusers/modular_pipelines/flux2/encoders.py
+++ b/src/diffusers/modular_pipelines/flux2/encoders.py
@@ -0,0 +1,349 @@
 # Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import List, Optional, Tuple, Union
 import torch
 from transformers import AutoProcessor, Mistral3ForConditionalGeneration
 from ...models import AutoencoderKLFlux2
 from ...utils import logging
 from ..modular_pipeline import ModularPipelineBlocks, PipelineState
 from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
 from .modular_pipeline import Flux2ModularPipeline
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 def format_text_input(prompts: List[str], system_message: Optional[str] = None):
    """Format prompts for Mistral3 chat template.

    Each prompt becomes one conversation: an optional system turn followed by a user turn. Literal `[IMG]`
    markers are removed from the prompt text before it is placed in the user turn.
    NOTE(review): presumably so user text cannot be mistaken for an image placeholder - confirm against the
    processor.

    Args:
        prompts: Prompt strings, one conversation is produced per entry.
        system_message: Text of the system turn. When `None` (the default) the system turn is omitted instead of
            emitting a turn whose text is `None`.

    Returns:
        A list with one conversation (a list of message dicts) per prompt.
    """
    conversations = []
    for prompt in prompts:
        turns = []
        if system_message is not None:
            # A fresh dict per conversation, so no message object is shared between conversations.
            turns.append(
                {
                    "role": "system",
                    "content": [{"type": "text", "text": system_message}],
                }
            )
        turns.append({"role": "user", "content": [{"type": "text", "text": prompt.replace("[IMG]", "")}]})
        conversations.append(turns)
    return conversations
 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
 def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
 ):
    """Extract latents from an encoder output object.

    Args:
        encoder_output: Object exposing either a `latent_dist` attribute or a `latents` attribute.
        generator: Passed to `latent_dist.sample` when `sample_mode` is `"sample"`.
        sample_mode: `"sample"` draws from `latent_dist`; `"argmax"` returns `latent_dist.mode()`.

    Returns:
        The sampled latents, the distribution mode, or `encoder_output.latents`, in that order of preference.

    Raises:
        AttributeError: If none of the branches applies (e.g. no `latent_dist` and no `latents` attribute).
    """
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    elif hasattr(encoder_output, "latents"):
        # Fallback for outputs that already carry latents (also reached for an unknown `sample_mode`).
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")
 class Flux2TextEncoderStep(ModularPipelineBlocks):
    """Pipeline block that produces `prompt_embeds` from `prompt` with a local Mistral3 text encoder."""

    model_name = "flux2"

    # fmt: off
    DEFAULT_SYSTEM_MESSAGE = "You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object attribution and actions without speculation."
    # fmt: on

    @property
    def description(self) -> str:
        """Human-readable summary of the step."""
        return "Text Encoder step that generates text embeddings using Mistral3 to guide the image generation"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        """Components required by the step: the text encoder and its processor."""
        return [
            ComponentSpec("text_encoder", Mistral3ForConditionalGeneration),
            ComponentSpec("tokenizer", AutoProcessor),
        ]

    @property
    def inputs(self) -> List[InputParam]:
        """Inputs read from the pipeline state."""
        return [
            InputParam("prompt"),
            InputParam("prompt_embeds", type_hint=torch.Tensor, required=False),
            InputParam("max_sequence_length", type_hint=int, default=512, required=False),
            InputParam("text_encoder_out_layers", type_hint=Tuple[int], default=(10, 20, 30), required=False),
            InputParam("joint_attention_kwargs"),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        """Values written to the pipeline state (`prompt_embeds`)."""
        return [
            OutputParam(
                "prompt_embeds",
                kwargs_type="denoiser_input_fields",
                type_hint=torch.Tensor,
                description="Text embeddings from Mistral3 used to guide the image generation",
            ),
        ]

    @staticmethod
    def check_inputs(block_state):
        """Validate `prompt` / `prompt_embeds` on the block state.

        Raises:
            ValueError: If both `prompt` and `prompt_embeds` are given, or `prompt` is neither `str` nor `list`.
        """
        prompt = block_state.prompt
        prompt_embeds = getattr(block_state, "prompt_embeds", None)

        # Nothing to validate when no prompt was supplied.
        if prompt is None:
            return
        if prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. "
                "Please make sure to only forward one of the two."
            )
        if not isinstance(prompt, (str, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    @staticmethod
    def _get_mistral_3_prompt_embeds(
        text_encoder: Mistral3ForConditionalGeneration,
        tokenizer: AutoProcessor,
        prompt: Union[str, List[str]],
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
        max_sequence_length: int = 512,
        system_message: str = DEFAULT_SYSTEM_MESSAGE,
        hidden_states_layers: Tuple[int] = (10, 20, 30),
    ):
        """Encode `prompt` and concatenate the selected hidden-state layers along the feature axis.

        Args:
            text_encoder: Model called with `output_hidden_states=True`.
            tokenizer: Processor whose `apply_chat_template` tokenizes the formatted conversations.
            prompt: A single prompt or a list of prompts.
            dtype: Output dtype; defaults to `text_encoder.dtype`.
            device: Device for inputs and output; defaults to `text_encoder.device`.
            max_sequence_length: Length every sequence is padded or truncated to.
            system_message: System turn placed before each prompt.
            hidden_states_layers: Indices into `output.hidden_states` to stack.

        Returns:
            Tensor of shape `(batch, seq_len, len(hidden_states_layers) * hidden_dim)`.
        """
        if dtype is None:
            dtype = text_encoder.dtype
        if device is None:
            device = text_encoder.device

        prompts = [prompt] if isinstance(prompt, str) else prompt
        conversations = format_text_input(prompts=prompts, system_message=system_message)

        encoded = tokenizer.apply_chat_template(
            conversations,
            add_generation_prompt=False,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=max_sequence_length,
        )
        token_ids = encoded["input_ids"].to(device)
        mask = encoded["attention_mask"].to(device)

        encoder_output = text_encoder(
            input_ids=token_ids,
            attention_mask=mask,
            output_hidden_states=True,
            use_cache=False,
        )

        # Stack the chosen layers on a new axis 1: (batch, layers, seq_len, hidden_dim).
        selected_layers = [encoder_output.hidden_states[layer] for layer in hidden_states_layers]
        stacked = torch.stack(selected_layers, dim=1).to(dtype=dtype, device=device)

        batch_size, num_layers, seq_len, hidden_dim = stacked.shape
        # Move the layer axis next to the feature axis, then merge the two.
        return stacked.permute(0, 2, 1, 3).reshape(batch_size, seq_len, num_layers * hidden_dim)

    @torch.no_grad()
    def __call__(self, components: Flux2ModularPipeline, state: PipelineState) -> PipelineState:
        """Fill `prompt_embeds` unless the caller already supplied it.

        A missing `prompt` is treated as the empty string.

        Returns:
            The `(components, state)` pair.
        """
        block_state = self.get_block_state(state)
        self.check_inputs(block_state)
        block_state.device = components._execution_device

        if block_state.prompt_embeds is None:
            prompt = block_state.prompt
            if prompt is None:
                prompt = ""
            if isinstance(prompt, str):
                prompt = [prompt]

            block_state.prompt_embeds = self._get_mistral_3_prompt_embeds(
                text_encoder=components.text_encoder,
                tokenizer=components.tokenizer,
                prompt=prompt,
                device=block_state.device,
                max_sequence_length=block_state.max_sequence_length,
                system_message=self.DEFAULT_SYSTEM_MESSAGE,
                hidden_states_layers=block_state.text_encoder_out_layers,
            )

        self.set_block_state(state, block_state)
        return components, state
 class Flux2RemoteTextEncoderStep(ModularPipelineBlocks):
    """Pipeline block that obtains `prompt_embeds` by POSTing the prompt to a remote endpoint."""

    model_name = "flux2"
    REMOTE_URL = "https://remote-text-encoder-flux-2.huggingface.co/predict"
    # (connect, read) timeout in seconds for the HTTP request. Without a timeout `requests` waits indefinitely.
    REQUEST_TIMEOUT = (10, 300)

    @property
    def description(self) -> str:
        """Human-readable summary of the step."""
        return "Text Encoder step that generates text embeddings using a remote API endpoint"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        """No local components are needed; encoding happens remotely."""
        return []

    @property
    def inputs(self) -> List[InputParam]:
        """Inputs read from the pipeline state."""
        return [
            InputParam("prompt"),
            InputParam("prompt_embeds", type_hint=torch.Tensor, required=False),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        """Values written to the pipeline state (`prompt_embeds`)."""
        return [
            OutputParam(
                "prompt_embeds",
                kwargs_type="denoiser_input_fields",
                type_hint=torch.Tensor,
                description="Text embeddings from remote API used to guide the image generation",
            ),
        ]

    @staticmethod
    def check_inputs(block_state):
        """Validate `prompt` / `prompt_embeds` on the block state.

        Raises:
            ValueError: If both `prompt` and `prompt_embeds` are given, or `prompt` is neither `str` nor `list`.
        """
        prompt = block_state.prompt
        prompt_embeds = getattr(block_state, "prompt_embeds", None)
        if prompt is not None and prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. "
                "Please make sure to only forward one of the two."
            )
        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    @torch.no_grad()
    def __call__(self, components: Flux2ModularPipeline, state: PipelineState) -> PipelineState:
        """Fill `prompt_embeds` from the remote endpoint unless the caller already supplied it.

        A missing `prompt` is treated as the empty string. The response body is deserialized with
        `torch.load(..., weights_only=True)` and moved to the execution device.

        Returns:
            The `(components, state)` pair.

        Raises:
            requests.HTTPError: If the endpoint answers with an error status.
            requests.Timeout: If the endpoint does not answer within `REQUEST_TIMEOUT`.
        """
        import io

        import requests
        from huggingface_hub import get_token

        block_state = self.get_block_state(state)
        self.check_inputs(block_state)
        block_state.device = components._execution_device

        if block_state.prompt_embeds is not None:
            self.set_block_state(state, block_state)
            return components, state

        prompt = block_state.prompt
        if prompt is None:
            prompt = ""
        prompt = [prompt] if isinstance(prompt, str) else prompt

        headers = {"Content-Type": "application/json"}
        token = get_token()
        if token is not None:
            # Only authenticate when a token exists; otherwise the header would read "Bearer None".
            headers["Authorization"] = f"Bearer {token}"

        response = requests.post(
            self.REMOTE_URL,
            json={"prompt": prompt},
            headers=headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()

        # Deserialize on CPU so loading does not depend on the device the tensor was saved from.
        block_state.prompt_embeds = torch.load(io.BytesIO(response.content), map_location="cpu", weights_only=True)
        block_state.prompt_embeds = block_state.prompt_embeds.to(block_state.device)

        self.set_block_state(state, block_state)
        return components, state
 class Flux2VaeEncoderStep(ModularPipelineBlocks):
    """Pipeline block that encodes each entry of `condition_images` into a normalized, patchified latent."""

    model_name = "flux2"

    @property
    def description(self) -> str:
        """Human-readable summary of the step."""
        return "VAE Encoder step that encodes preprocessed images into latent representations for Flux2."

    @property
    def expected_components(self) -> List[ComponentSpec]:
        """Components required by the step (only the VAE)."""
        return [ComponentSpec("vae", AutoencoderKLFlux2)]

    @property
    def inputs(self) -> List[InputParam]:
        """Inputs read from the pipeline state."""
        return [
            InputParam("condition_images", type_hint=List[torch.Tensor]),
            InputParam("generator"),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        """Values written to the pipeline state (`image_latents`)."""
        return [
            OutputParam(
                "image_latents",
                type_hint=List[torch.Tensor],
                description="List of latent representations for each reference image",
            ),
        ]

    @staticmethod
    def _patchify_latents(latents):
        """Move every 2x2 spatial patch into the channel axis.

        Args:
            latents: 4-D tensor unpacked as `(batch, channels, height, width)`.

        Returns:
            Tensor of shape `(batch, channels * 4, height // 2, width // 2)`.
        """
        n, channels, rows, cols = latents.shape
        half_rows, half_cols = rows // 2, cols // 2
        # (n, c, h/2, 2, w/2, 2) -> (n, c, 2, 2, h/2, w/2): patch offsets end up right after the channel axis.
        tiles = latents.view(n, channels, half_rows, 2, half_cols, 2).permute(0, 1, 3, 5, 2, 4)
        return tiles.reshape(n, channels * 4, half_rows, half_cols)

    def _encode_vae_image(self, vae: AutoencoderKLFlux2, image: torch.Tensor, generator: torch.Generator):
        """Encode one image batch and normalize it with the statistics held in `vae.bn`.

        Latents are taken with `sample_mode="argmax"`, i.e. the distribution mode rather than a random sample.

        Raises:
            ValueError: If `image` is not 4-dimensional.
        """
        if image.ndim != 4:
            raise ValueError(f"Expected image dims 4, got {image.ndim}.")

        encoded = vae.encode(image)
        patched = self._patchify_latents(retrieve_latents(encoded, generator=generator, sample_mode="argmax"))

        # Statistics are reshaped to (1, C, 1, 1) so they broadcast over batch and spatial axes.
        bn_mean = vae.bn.running_mean.view(1, -1, 1, 1).to(patched.device, patched.dtype)
        bn_std = torch.sqrt(vae.bn.running_var.view(1, -1, 1, 1) + vae.config.batch_norm_eps)
        bn_std = bn_std.to(patched.device, patched.dtype)
        return (patched - bn_mean) / bn_std

    @torch.no_grad()
    def __call__(self, components: Flux2ModularPipeline, state: PipelineState) -> PipelineState:
        """Encode all condition images; return immediately, leaving the state untouched, when there are none.

        Returns:
            The `(components, state)` pair.
        """
        block_state = self.get_block_state(state)
        condition_images = block_state.condition_images
        if condition_images is None:
            return components, state

        device = components._execution_device
        dtype = components.vae.dtype

        block_state.image_latents = [
            self._encode_vae_image(
                vae=components.vae,
                image=image.to(device=device, dtype=dtype),
                generator=block_state.generator,
            )
            for image in condition_images
        ]

        self.set_block_state(state, block_state)
        return components, state
--- a/src/diffusers/modular_pipelines/flux2/inputs.py
+++ b/src/diffusers/modular_pipelines/flux2/inputs.py
@@ -0,0 +1,160 @@
 # Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import List
 import torch
 from ...configuration_utils import FrozenDict
 from ...pipelines.flux2.image_processor import Flux2ImageProcessor
 from ...utils import logging
 from ..modular_pipeline import ModularPipelineBlocks, PipelineState
 from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
 from .modular_pipeline import Flux2ModularPipeline
 logger = logging.get_logger(__name__)
 class Flux2TextInputStep(ModularPipelineBlocks):
    """Pipeline block that records `batch_size` / `dtype` and repeats `prompt_embeds` per requested image."""

    model_name = "flux2"

    @property
    def description(self) -> str:
        """Human-readable summary of the step."""
        return (
            "This step:\n"
            "  1. Determines `batch_size` and `dtype` based on `prompt_embeds`\n"
            "  2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt)"
        )

    @property
    def inputs(self) -> List[InputParam]:
        """Inputs read from the pipeline state."""
        return [
            InputParam("num_images_per_prompt", default=1),
            InputParam(
                "prompt_embeds",
                required=True,
                kwargs_type="denoiser_input_fields",
                type_hint=torch.Tensor,
                description="Pre-generated text embeddings from Mistral3. Can be generated from text_encoder step.",
            ),
        ]

    @property
    def intermediate_outputs(self) -> List[str]:
        """Values written to the pipeline state."""
        return [
            OutputParam(
                "batch_size",
                type_hint=int,
                description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt",
            ),
            OutputParam(
                "dtype",
                type_hint=torch.dtype,
                description="Data type of model tensor inputs (determined by `prompt_embeds`)",
            ),
            OutputParam(
                "prompt_embeds",
                type_hint=torch.Tensor,
                kwargs_type="denoiser_input_fields",
                description="Text embeddings used to guide the image generation",
            ),
        ]

    @torch.no_grad()
    def __call__(self, components: Flux2ModularPipeline, state: PipelineState) -> PipelineState:
        """Derive `batch_size` and `dtype` from `prompt_embeds` and expand it to `batch_size * num_images_per_prompt`.

        Returns:
            The `(components, state)` pair.
        """
        block_state = self.get_block_state(state)

        embeds = block_state.prompt_embeds
        batch_size, seq_len, _ = embeds.shape
        copies = block_state.num_images_per_prompt

        block_state.batch_size = batch_size
        block_state.dtype = embeds.dtype

        # Tile along the sequence axis, then fold the copies into the batch axis, so the copies of one prompt sit
        # next to each other in the batch.
        block_state.prompt_embeds = embeds.repeat(1, copies, 1).view(batch_size * copies, seq_len, -1)

        self.set_block_state(state, block_state)
        return components, state
 class Flux2ProcessImagesInputStep(ModularPipelineBlocks):
    """Pipeline block that validates, downsizes and crops reference images into `condition_images`."""

    model_name = "flux2"

    @property
    def description(self) -> str:
        """Human-readable summary of the step."""
        return "Image preprocess step for Flux2. Validates and preprocesses reference images."

    @property
    def expected_components(self) -> List[ComponentSpec]:
        """Components required by the step: an image processor created from a fixed config."""
        return [
            ComponentSpec(
                "image_processor",
                Flux2ImageProcessor,
                config=FrozenDict({"vae_scale_factor": 16, "vae_latent_channels": 32}),
                default_creation_method="from_config",
            ),
        ]

    @property
    def inputs(self) -> List[InputParam]:
        """Inputs read from the pipeline state."""
        return [
            InputParam("image"),
            InputParam("height"),
            InputParam("width"),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        """Values written to the pipeline state (`condition_images`)."""
        return [OutputParam(name="condition_images", type_hint=List[torch.Tensor])]

    @torch.no_grad()
    def __call__(self, components: Flux2ModularPipeline, state: PipelineState):
        """Preprocess `image` (a single image or a list) into `condition_images`.

        Images larger than 1024 * 1024 pixels are resized to that area, and both sides are rounded down to a
        multiple of `components.vae_scale_factor * 2` before preprocessing with `resize_mode="crop"`. `height` and
        `width` on the block state are filled from the first processed image when they are `None`.

        Returns:
            The `(components, state)` pair.
        """
        block_state = self.get_block_state(state)

        reference_images = block_state.image
        if reference_images is None:
            block_state.condition_images = None
            self.set_block_state(state, block_state)
            return components, state

        if not isinstance(reference_images, list):
            reference_images = [reference_images]

        max_pixels = 1024 * 1024
        processed = []
        for picture in reference_images:
            processor = components.image_processor
            processor.check_image_input(picture)

            pic_width, pic_height = picture.size
            if pic_width * pic_height > max_pixels:
                picture = processor._resize_to_target_area(picture, max_pixels)
                pic_width, pic_height = picture.size

            # Round both sides down to the nearest multiple.
            # NOTE(review): presumably so the latent grid splits evenly into 2x2 patches - confirm.
            multiple_of = components.vae_scale_factor * 2
            pic_width -= pic_width % multiple_of
            pic_height -= pic_height % multiple_of

            processed.append(
                processor.preprocess(picture, height=pic_height, width=pic_width, resize_mode="crop")
            )

            if block_state.height is None:
                block_state.height = pic_height
            if block_state.width is None:
                block_state.width = pic_width

        block_state.condition_images = processed
        self.set_block_state(state, block_state)
        return components, state
--- a/src/diffusers/modular_pipelines/flux2/modular_blocks.py
+++ b/src/diffusers/modular_pipelines/flux2/modular_blocks.py
@@ -0,0 +1,166 @@
 # Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from ...utils import logging
 from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
 from ..modular_pipeline_utils import InsertableDict
 from .before_denoise import (
    Flux2PrepareImageLatentsStep,
    Flux2PrepareLatentsStep,
    Flux2RoPEInputsStep,
    Flux2SetTimestepsStep,
 )
 from .decoders import Flux2DecodeStep
 from .denoise import Flux2DenoiseStep
 from .encoders import (
    Flux2RemoteTextEncoderStep,
    Flux2TextEncoderStep,
    Flux2VaeEncoderStep,
 )
 from .inputs import (
    Flux2ProcessImagesInputStep,
    Flux2TextInputStep,
 )
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 # Ordered sub-blocks of the image-conditioning branch: preprocess -> VAE encode -> prepare image latents.
 Flux2VaeEncoderBlocks = InsertableDict(
    [
        ("preprocess", Flux2ProcessImagesInputStep()),
        ("encode", Flux2VaeEncoderStep()),
        ("prepare_image_latents", Flux2PrepareImageLatentsStep()),
    ]
 )
 class Flux2VaeEncoderSequentialStep(SequentialPipelineBlocks):
    """Sequential block built from the entries of `Flux2VaeEncoderBlocks`."""
    model_name = "flux2"
    # Blocks and their names are taken from the same ordered dict, so the two stay aligned.
    block_classes = Flux2VaeEncoderBlocks.values()
    block_names = Flux2VaeEncoderBlocks.keys()
    @property
    def description(self) -> str:
        """Human-readable summary of the step."""
        return "VAE encoder step that preprocesses, encodes, and prepares image latents for Flux2 conditioning."
 class Flux2AutoVaeEncoderStep(AutoPipelineBlocks):
    """Auto block that wraps `Flux2VaeEncoderSequentialStep` behind the `image` trigger input."""
    # The three lists are parallel: block, its name, and the input that selects it.
    block_classes = [Flux2VaeEncoderSequentialStep]
    block_names = ["img_conditioning"]
    block_trigger_inputs = ["image"]
    @property
    def description(self):
        """Human-readable summary, including when the wrapped step runs or is skipped."""
        return (
            "VAE encoder step that encodes the image inputs into their latent representations.\n"
            "This is an auto pipeline block that works for image conditioning tasks.\n"
            " - `Flux2VaeEncoderSequentialStep` is used when `image` is provided.\n"
            " - If `image` is not provided, step will be skipped."
        )
 # Ordered sub-blocks run before the denoise loop: prepare latents -> set timesteps -> prepare RoPE inputs.
 Flux2BeforeDenoiseBlocks = InsertableDict(
    [
        ("prepare_latents", Flux2PrepareLatentsStep()),
        ("set_timesteps", Flux2SetTimestepsStep()),
        ("prepare_rope_inputs", Flux2RoPEInputsStep()),
    ]
 )
 class Flux2BeforeDenoiseStep(SequentialPipelineBlocks):
    """Sequential block built from the entries of `Flux2BeforeDenoiseBlocks`."""
    model_name = "flux2"
    # Blocks and their names are taken from the same ordered dict, so the two stay aligned.
    block_classes = Flux2BeforeDenoiseBlocks.values()
    block_names = Flux2BeforeDenoiseBlocks.keys()
    @property
    def description(self):
        """Human-readable summary of the step."""
        return "Before denoise step that prepares the inputs for the denoise step in Flux2 generation."
 # Default preset: local text encoder, with image conditioning handled by the auto VAE-encoder block.
 AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", Flux2TextEncoderStep()),
        ("text_input", Flux2TextInputStep()),
        ("vae_image_encoder", Flux2AutoVaeEncoderStep()),
        ("before_denoise", Flux2BeforeDenoiseStep()),
        ("denoise", Flux2DenoiseStep()),
        ("decode", Flux2DecodeStep()),
    ]
 )
 # Same layout as `AUTO_BLOCKS`, except that the text encoder entry is the remote variant.
 REMOTE_AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", Flux2RemoteTextEncoderStep()),
        ("text_input", Flux2TextInputStep()),
        ("vae_image_encoder", Flux2AutoVaeEncoderStep()),
        ("before_denoise", Flux2BeforeDenoiseStep()),
        ("denoise", Flux2DenoiseStep()),
        ("decode", Flux2DecodeStep()),
    ]
 )
 class Flux2AutoBlocks(SequentialPipelineBlocks):
    """Sequential block built from the entries of `AUTO_BLOCKS`."""
    model_name = "flux2"
    # Blocks and their names are taken from the same ordered dict, so the two stay aligned.
    block_classes = AUTO_BLOCKS.values()
    block_names = AUTO_BLOCKS.keys()
    @property
    def description(self):
        """Human-readable summary of the supported workflows and their required inputs."""
        return (
            "Auto Modular pipeline for text-to-image and image-conditioned generation using Flux2.\n"
            "- For text-to-image generation, all you need to provide is `prompt`.\n"
            "- For image-conditioned generation, you need to provide `image` (list of PIL images)."
        )
 # Flat text-to-image preset: no image preprocessing or VAE encoding steps.
 TEXT2IMAGE_BLOCKS = InsertableDict(
    [
        ("text_encoder", Flux2TextEncoderStep()),
        ("text_input", Flux2TextInputStep()),
        ("prepare_latents", Flux2PrepareLatentsStep()),
        ("set_timesteps", Flux2SetTimestepsStep()),
        ("prepare_rope_inputs", Flux2RoPEInputsStep()),
        ("denoise", Flux2DenoiseStep()),
        ("decode", Flux2DecodeStep()),
    ]
 )
 # Flat image-conditioned preset: the text-to-image steps plus preprocess / VAE encode / prepare-image-latents,
 # which run before the latents are prepared.
 IMAGE_CONDITIONED_BLOCKS = InsertableDict(
    [
        ("text_encoder", Flux2TextEncoderStep()),
        ("text_input", Flux2TextInputStep()),
        ("preprocess_images", Flux2ProcessImagesInputStep()),
        ("vae_encoder", Flux2VaeEncoderStep()),
        ("prepare_image_latents", Flux2PrepareImageLatentsStep()),
        ("prepare_latents", Flux2PrepareLatentsStep()),
        ("set_timesteps", Flux2SetTimestepsStep()),
        ("prepare_rope_inputs", Flux2RoPEInputsStep()),
        ("denoise", Flux2DenoiseStep()),
        ("decode", Flux2DecodeStep()),
    ]
 )
 # Lookup table from preset name to the block dict defined for it in this module.
 ALL_BLOCKS = {
    "text2image": TEXT2IMAGE_BLOCKS,
    "image_conditioned": IMAGE_CONDITIONED_BLOCKS,
    "auto": AUTO_BLOCKS,
    "remote": REMOTE_AUTO_BLOCKS,
 }
--- a/src/diffusers/modular_pipelines/flux2/modular_pipeline.py
+++ b/src/diffusers/modular_pipelines/flux2/modular_pipeline.py
@@ -0,0 +1,57 @@
 # Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from ...loaders import Flux2LoraLoaderMixin
 from ...utils import logging
 from ..modular_pipeline import ModularPipeline
 # Module-level logger, named after this module via `__name__`.
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 class Flux2ModularPipeline(ModularPipeline, Flux2LoraLoaderMixin):
    """
    A ModularPipeline for Flux2.
    > [!WARNING] > This is an experimental feature and is likely to change in the future.
    """
    # Name of the block preset associated with this pipeline class.
    default_blocks_name = "Flux2AutoBlocks"
    @property
    def default_height(self):
        """Default height: `default_sample_size * vae_scale_factor`."""
        return self.default_sample_size * self.vae_scale_factor
    @property
    def default_width(self):
        """Default width: `default_sample_size * vae_scale_factor` (same value as the height)."""
        return self.default_sample_size * self.vae_scale_factor
    @property
    def default_sample_size(self):
        """Fixed base sample size (128) that the default height and width are derived from."""
        return 128
    @property
    def vae_scale_factor(self):
        """
        Scale factor derived from the VAE config.

        Returns `2 ** (len(vae.config.block_out_channels) - 1)` when a `vae` attribute is set, and falls back to 8
        when the pipeline has no VAE.
        """
        vae = getattr(self, "vae", None)
        if vae is None:
            return 8
        return 2 ** (len(vae.config.block_out_channels) - 1)
    @property
    def num_channels_latents(self):
        """
        Number of latent channels derived from the transformer config.

        Returns `transformer.config.in_channels // 4` when a `transformer` attribute is set, and falls back to 32
        when the pipeline has no transformer.
        """
        transformer = getattr(self, "transformer", None)
        # Compare against None explicitly, as `vae_scale_factor` does: a truthiness test would silently fall back
        # to the default for a present component whose class defines `__len__`/`__bool__` and evaluates false.
        if transformer is None:
            return 32
        return transformer.config.in_channels // 4
--- a/src/diffusers/modular_pipelines/modular_pipeline.py
+++ b/src/diffusers/modular_pipelines/modular_pipeline.py
@@ -58,6 +58,7 @@ MODULAR_PIPELINE_MAPPING = OrderedDict(
        ("wan", "WanModularPipeline"),
        ("flux", "FluxModularPipeline"),
        ("flux-kontext", "FluxKontextModularPipeline"),
        ("flux2", "Flux2ModularPipeline"),
        ("qwenimage", "QwenImageModularPipeline"),
        ("qwenimage-edit", "QwenImageEditModularPipeline"),
        ("qwenimage-edit-plus", "QwenImageEditPlusModularPipeline"),
@@ -1585,7 +1586,6 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
        for name, config_spec in self._config_specs.items():
            default_configs[name] = config_spec.default
        self.register_to_config(**default_configs)
        self.register_to_config(_blocks_class_name=self.blocks.__class__.__name__ if self.blocks is not None else None)
    @property
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -2,6 +2,36 @@
 from ..utils import DummyObject, requires_backends
 # Placeholder exported under the name `Flux2AutoBlocks`. The constructor and both loader
 # classmethods do nothing except call `requires_backends` with "torch" and "transformers".
 # NOTE(review): presumably `requires_backends` raises when a listed backend is missing, and
 # this file may be generated by tooling — confirm before editing by hand.
 class Flux2AutoBlocks(metaclass=DummyObject):
    # Backends this placeholder stands in for.
    _backends = ["torch", "transformers"]
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch", "transformers"])
    @classmethod
    def from_config(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])
    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])
 # Placeholder exported under the name `Flux2ModularPipeline`. The constructor and both loader
 # classmethods do nothing except call `requires_backends` with "torch" and "transformers".
 # NOTE(review): presumably `requires_backends` raises when a listed backend is missing, and
 # this file may be generated by tooling — confirm before editing by hand.
 class Flux2ModularPipeline(metaclass=DummyObject):
    # Backends this placeholder stands in for.
    _backends = ["torch", "transformers"]
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch", "transformers"])
    @classmethod
    def from_config(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])
    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])
 class FluxAutoBlocks(metaclass=DummyObject):
    _backends = ["torch", "transformers"]
--- a/tests/modular_pipelines/flux2/init.py
+++ b/tests/modular_pipelines/flux2/init.py
--- a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py
+++ b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2.py
@@ -0,0 +1,93 @@
 # coding=utf-8
 # Copyright 2025 HuggingFace Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import random
 import numpy as np
 import PIL
 import pytest
 from diffusers.modular_pipelines import (
    Flux2AutoBlocks,
    Flux2ModularPipeline,
 )
 from ...testing_utils import floats_tensor, torch_device
 from ..test_modular_pipelines_common import ModularPipelineTesterMixin
 class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin):
    # Prompt-only test configuration for the Flux2 modular pipeline, run through the shared tester mixin.
    pipeline_class = Flux2ModularPipeline
    pipeline_blocks_class = Flux2AutoBlocks
    pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-modular"
    params = frozenset(["prompt", "height", "width", "guidance_scale"])
    batch_params = frozenset(["prompt"])
    def get_dummy_inputs(self, seed=0):
        """Return the keyword arguments for one small prompt-only pipeline call."""
        return {
            "prompt": "A painting of a squirrel eating a burger",
            # TODO (Dhruv): Update text encoder config so that vocab_size matches tokenizer
            "max_sequence_length": 8,  # bit of a hack to workaround vocab size mismatch
            "text_encoder_out_layers": (1,),
            "generator": self.get_generator(seed),
            "num_inference_steps": 2,
            "guidance_scale": 4.0,
            "height": 32,
            "width": 32,
            "output_type": "pt",
        }
    def test_float16_inference(self):
        """Run the inherited float16 check with 9e-2 as its positional argument."""
        tolerance = 9e-2
        super().test_float16_inference(tolerance)
 class TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin):
    # Same pipeline and checkpoint as the prompt-only tests, with an `image` input added.
    pipeline_class = Flux2ModularPipeline
    pipeline_blocks_class = Flux2AutoBlocks
    pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-modular"
    params = frozenset(["prompt", "height", "width", "guidance_scale", "image"])
    batch_params = frozenset(["prompt", "image"])
    def get_dummy_inputs(self, seed=0):
        """Return the keyword arguments for one small pipeline call that includes an RGB PIL `image`."""
        dummy_inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            # TODO (Dhruv): Update text encoder config so that vocab_size matches tokenizer
            "max_sequence_length": 8,  # bit of a hack to workaround vocab size mismatch
            "text_encoder_out_layers": (1,),
            "generator": self.get_generator(seed),
            "num_inference_steps": 2,
            "guidance_scale": 4.0,
            "height": 32,
            "width": 32,
            "output_type": "pt",
        }
        # Build the conditioning image from a (1, 3, 64, 64) float tensor seeded with `seed`.
        pixels = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(torch_device)
        # Move the channel axis last and drop the batch axis before handing the data to PIL.
        channels_last = pixels.cpu().permute(0, 2, 3, 1)[0]
        dummy_inputs["image"] = PIL.Image.fromarray(np.uint8(channels_last * 255)).convert("RGB")
        return dummy_inputs
    def test_float16_inference(self):
        """Run the inherited float16 check with 9e-2 as its positional argument."""
        tolerance = 9e-2
        super().test_float16_inference(tolerance)
    @pytest.mark.skip(reason="batched inference is currently not supported")
    def test_inference_batch_single_identical(self, batch_size=2, expected_max_diff=0.0001):
        """Skipped override; the body is intentionally empty."""
        return None
--- a/tests/modular_pipelines/test_modular_pipelines_common.py
+++ b/tests/modular_pipelines/test_modular_pipelines_common.py
@@ -165,7 +165,6 @@ class ModularPipelineTesterMixin:
        expected_max_diff=1e-4,
    ):
        pipe = self.get_pipeline().to(torch_device)
        inputs = self.get_dummy_inputs()
        # Reset generator in case it has been used in self.get_dummy_inputs
Author	SHA1	Message	Date
DN6	21ecee5655	update	2025-12-10 11:29:05 +05:30
DN6	04de34b82e	update	2025-12-10 11:18:10 +05:30
DN6	2e8c97b734	update	2025-12-10 11:01:21 +05:30
DN6	3806a9add3	update	2025-12-10 09:16:39 +05:30
DN6	75876748e5	update	2025-12-09 14:06:09 +05:30
DN6	771512a46d	update	2025-12-09 14:05:44 +05:30
DN6	b0f50c64e1	update	2025-12-09 12:18:30 +05:30
DN6	921b959b9a	update	2025-12-01 10:42:35 +05:30
DN6	9391a5465d	Merge branch 'main' into flux2-modular	2025-11-28 22:23:38 +05:30
DN6	d780d1a42a	update	2025-11-28 15:27:30 +05:30
DN6	9264459f88	update	2025-11-28 12:57:00 +05:30