Skip to content

BriaTransformer2DModel

A modified flux Transformer model from Bria

mindone.diffusers.BriaTransformer2DModel

Bases: ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin

The Transformer model introduced in Flux. Based on FluxPipeline with several changes: (1) no pooled embeddings; (2) zero padding is used for prompts; (3) no guidance embedding, since this is not a distilled version. Reference: https://blackforestlabs.ai/announcing-black-forest-labs/

PARAMETER DESCRIPTION
patch_size

Patch size to turn the input data into small patches.

TYPE: `int` DEFAULT: 1

in_channels

The number of channels in the input.

TYPE: `int`, *optional*, defaults to 64 DEFAULT: 64

num_layers

The number of layers of MMDiT blocks to use.

TYPE: `int`, *optional*, defaults to 19 DEFAULT: 19

num_single_layers

The number of layers of single DiT blocks to use.

TYPE: `int`, *optional*, defaults to 38 DEFAULT: 38

attention_head_dim

The number of channels in each head.

TYPE: `int`, *optional*, defaults to 128 DEFAULT: 128

num_attention_heads

The number of heads to use for multi-head attention.

TYPE: `int`, *optional*, defaults to 24 DEFAULT: 24

joint_attention_dim

The number of encoder_hidden_states dimensions to use.

TYPE: `int`, *optional* DEFAULT: 4096

pooled_projection_dim

Number of dimensions to use when projecting the pooled_projections.

TYPE: `int` DEFAULT: None

guidance_embeds

Whether to use guidance embeddings.

TYPE: `bool`, defaults to False DEFAULT: False

Source code in mindone/diffusers/models/transformers/transformer_bria.py
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
class BriaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin):
    """
    The Transformer model introduced in Flux. Based on FluxPipeline with several changes:
    - no pooled embeddings
    - We use zero padding for prompts
    - No guidance embedding since this is not a distilled version
    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/

    Parameters:
        patch_size (`int`, *optional*, defaults to 1):
            Patch size to turn the input data into small patches. Here it only scales the width of the output
            projection (`patch_size * patch_size * out_channels`).
        in_channels (`int`, *optional*, defaults to 64):
            The number of channels in the input. Also used as the number of output channels.
        num_layers (`int`, *optional*, defaults to 19): The number of layers of MMDiT blocks to use.
        num_single_layers (`int`, *optional*, defaults to 38): The number of layers of single DiT blocks to use.
        attention_head_dim (`int`, *optional*, defaults to 128): The number of channels in each head.
        num_attention_heads (`int`, *optional*, defaults to 24):
            The number of heads to use for multi-head attention. The model width is
            `num_attention_heads * attention_head_dim`.
        joint_attention_dim (`int`, *optional*, defaults to 4096):
            The number of `encoder_hidden_states` dimensions to use.
        pooled_projection_dim (`int`, *optional*, defaults to `None`):
            Number of dimensions to use when projecting the `pooled_projections`. Not used when building the
            layers of this model.
        guidance_embeds (`bool`, defaults to False):
            Whether to use guidance embeddings. The guidance embedding layer is only created when this is True.
        axes_dims_rope (`List[int]`, *optional*, defaults to `[16, 56, 56]`):
            Per-axis dimensions handed to the positional embedding (`BriaEmbedND`) as `axes_dim`.
        rope_theta (*optional*, defaults to 10000): `theta` handed to the positional embedding (`BriaEmbedND`).
        time_theta (*optional*, defaults to 10000): `time_theta` handed to the timestep embedding.
    """

    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(
        self,
        patch_size: int = 1,
        in_channels: int = 64,
        num_layers: int = 19,
        num_single_layers: int = 38,
        attention_head_dim: int = 128,
        num_attention_heads: int = 24,
        joint_attention_dim: int = 4096,
        pooled_projection_dim: int = None,
        guidance_embeds: bool = False,
        axes_dims_rope: List[int] = [16, 56, 56],
        rope_theta=10000,
        time_theta=10000,
    ):
        super().__init__()
        # The model predicts as many channels as it receives.
        self.out_channels = in_channels
        self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim

        self.pos_embed = BriaEmbedND(theta=rope_theta, axes_dim=axes_dims_rope)

        self.time_embed = BriaTimestepProjEmbeddings(embedding_dim=self.inner_dim, time_theta=time_theta)
        # `guidance_embed` only exists on guidance-enabled checkpoints.
        if guidance_embeds:
            self.guidance_embed = BriaTimestepProjEmbeddings(embedding_dim=self.inner_dim)

        # Linear projections that bring text and image tokens to the shared model width.
        self.context_embedder = mint.nn.Linear(self.config.joint_attention_dim, self.inner_dim)
        self.x_embedder = mint.nn.Linear(self.config.in_channels, self.inner_dim)

        self.transformer_blocks = nn.CellList(
            [
                BriaTransformerBlock(
                    dim=self.inner_dim,
                    num_attention_heads=self.config.num_attention_heads,
                    attention_head_dim=self.config.attention_head_dim,
                )
                for _ in range(self.config.num_layers)
            ]
        )

        self.single_transformer_blocks = nn.CellList(
            [
                BriaSingleTransformerBlock(
                    dim=self.inner_dim,
                    num_attention_heads=self.config.num_attention_heads,
                    attention_head_dim=self.config.attention_head_dim,
                )
                for _ in range(self.config.num_single_layers)
            ]
        )

        self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
        self.proj_out = mint.nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)

        self.gradient_checkpointing = False

    def construct(
        self,
        hidden_states: ms.Tensor,
        encoder_hidden_states: ms.Tensor = None,
        pooled_projections: ms.Tensor = None,
        timestep: ms.Tensor = None,
        img_ids: ms.Tensor = None,
        txt_ids: ms.Tensor = None,
        guidance: ms.Tensor = None,
        attention_kwargs: Optional[Dict[str, Any]] = None,
        return_dict: bool = True,
        controlnet_block_samples=None,
        controlnet_single_block_samples=None,
    ) -> Union[Tuple[ms.Tensor], Transformer2DModelOutput]:
        """
        The [`BriaTransformer2DModel`] forward method.

        Args:
            hidden_states (`ms.Tensor`):
                Input `hidden_states`. Fed to a linear layer that expects `in_channels` features in the last
                dimension (presumably packed latents of shape `(batch size, image_sequence_len, in_channels)`
                -- TODO confirm against the pipeline).
            encoder_hidden_states (`ms.Tensor` of shape `(batch size, sequence_len, embed_dims)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
            pooled_projections (`ms.Tensor` of shape `(batch_size, projection_dim)`): Embeddings projected
                from the embeddings of input conditions. Accepted for interface compatibility; not read by this
                method.
            timestep ( `ms.Tensor`):
                Used to indicate denoising step. Required in practice: it is cast and embedded unconditionally.
            img_ids (`ms.Tensor`):
                Position ids of the image tokens. A 3-D tensor is reduced to its first entry along dim 0.
            txt_ids (`ms.Tensor`):
                Position ids of the text tokens. A 3-D tensor is reduced to its first entry along dim 0. They are
                concatenated in front of `img_ids` before the rotary embedding is computed.
            guidance (`ms.Tensor`, *optional*):
                Guidance values. When given, their embedding is added to the timestep embedding; this requires a
                model created with `guidance_embeds=True`.
            attention_kwargs (`dict`, *optional*):
                Only inspected for a `"scale"` entry, which is rejected (see Raises). It is not forwarded to the
                transformer blocks by this method.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
                tuple.
            controlnet_block_samples (`list` of `ms.Tensor`, *optional*):
                A list of tensors that if specified are added to the residuals of the transformer blocks. Each
                sample is shared by `ceil(num_blocks / num_samples)` consecutive blocks.
            controlnet_single_block_samples (`list` of `ms.Tensor`, *optional*):
                Same as `controlnet_block_samples`, applied after the single transformer blocks.

        Returns:
            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
            `tuple` where the first element is the sample tensor.

        Raises:
            RuntimeError: If `attention_kwargs` contains a `"scale"` entry.
        """
        if attention_kwargs is not None and "scale" in attention_kwargs:
            # weight the lora layers by setting `lora_scale` for each PEFT layer here
            # and remove `lora_scale` from each PEFT layer at the end.
            # scale_lora_layers & unscale_lora_layers maybe contains some operation forbidden in graph mode
            raise RuntimeError(
                f"You are trying to set scaling of lora layer by passing {attention_kwargs['scale']=}. "
                f"However it's not allowed in on-the-fly model forwarding. "
                f"Please manually call `scale_lora_layers(model, lora_scale)` before model forwarding and "
                f"`unscale_lora_layers(model, lora_scale)` after model forwarding. "
                f"For example, it can be done in a pipeline call like `StableDiffusionPipeline.__call__`."
            )
        hidden_states = self.x_embedder(hidden_states)

        timestep = timestep.to(hidden_states.dtype)
        temb = self.time_embed(timestep, dtype=hidden_states.dtype)

        # Test for presence, not truthiness: a zero-valued guidance tensor is still a valid input, and the
        # truth value of a multi-element tensor is not well defined.
        if guidance is not None:
            guidance = guidance.to(hidden_states.dtype)
            temb += self.guidance_embed(guidance, dtype=hidden_states.dtype)

        encoder_hidden_states = self.context_embedder(encoder_hidden_states)

        # Batched (3-D) id tensors are reduced to their first entry.
        if len(txt_ids.shape) == 3:
            txt_ids = txt_ids[0]

        if len(img_ids.shape) == 3:
            img_ids = img_ids[0]

        # Text ids come first, then image ids.
        ids = mint.cat((txt_ids, img_ids), dim=0)
        image_rotary_emb = self.pos_embed(ids)

        # Number of consecutive blocks sharing one controlnet sample; loop-invariant, so computed once.
        block_interval = 1
        if controlnet_block_samples is not None:
            block_interval = int(np.ceil(len(self.transformer_blocks) / len(controlnet_block_samples)))

        for index_block, block in enumerate(self.transformer_blocks):
            encoder_hidden_states, hidden_states = block(
                hidden_states=hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                temb=temb,
                image_rotary_emb=image_rotary_emb,
            )

            # controlnet residual
            if controlnet_block_samples is not None:
                hidden_states = hidden_states + controlnet_block_samples[index_block // block_interval]

        single_interval = 1
        if controlnet_single_block_samples is not None:
            single_interval = int(
                np.ceil(len(self.single_transformer_blocks) / len(controlnet_single_block_samples))
            )

        for index_block, block in enumerate(self.single_transformer_blocks):
            encoder_hidden_states, hidden_states = block(
                hidden_states=hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                temb=temb,
                image_rotary_emb=image_rotary_emb,
            )

            # controlnet residual
            # NOTE(review): the residual is added only to positions after the first
            # `encoder_hidden_states.shape[1]` tokens, which presumes `hidden_states` still carries the text
            # tokens in front of the image tokens -- confirm against BriaSingleTransformerBlock's return value.
            if controlnet_single_block_samples is not None:
                hidden_states[:, encoder_hidden_states.shape[1] :, ...] = (
                    hidden_states[:, encoder_hidden_states.shape[1] :, ...]
                    + controlnet_single_block_samples[index_block // single_interval]
                )

        hidden_states = self.norm_out(hidden_states, temb)
        output = self.proj_out(hidden_states)

        if not return_dict:
            return (output,)

        return Transformer2DModelOutput(sample=output)

mindone.diffusers.BriaTransformer2DModel.construct(hidden_states, encoder_hidden_states=None, pooled_projections=None, timestep=None, img_ids=None, txt_ids=None, guidance=None, attention_kwargs=None, return_dict=True, controlnet_block_samples=None, controlnet_single_block_samples=None)

The [BriaTransformer2DModel] forward method.

PARAMETER DESCRIPTION
hidden_states

Input hidden_states.

TYPE: `ms.Tensor` of shape `(batch size, channel, height, width)`

encoder_hidden_states

Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.

TYPE: `ms.Tensor` of shape `(batch size, sequence_len, embed_dims)` DEFAULT: None

pooled_projections

Embeddings projected from the embeddings of input conditions.

TYPE: `ms.Tensor` of shape `(batch_size, projection_dim)` DEFAULT: None

timestep

Used to indicate denoising step.

TYPE: `ms.Tensor` DEFAULT: None

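controlnet_block_samples

A list of tensors that, if specified, are added to the residuals of the transformer blocks. (The source docstring lists this entry as `block_controlnet_hidden_states`; the keyword argument in the signature is `controlnet_block_samples`.)

TYPE: `list` of `ms.Tensor` DEFAULT: None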

attention_kwargs

A kwargs dictionary that if specified is passed along to the AttentionProcessor as defined under self.processor in diffusers.models.attention_processor.

TYPE: `dict`, *optional* DEFAULT: None

return_dict

Whether or not to return a [~models.transformer_2d.Transformer2DModelOutput] instead of a plain tuple.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

RETURNS DESCRIPTION
Union[Tuple[Tensor], Transformer2DModelOutput]

If return_dict is True, a [~models.transformer_2d.Transformer2DModelOutput] is returned, otherwise a tuple where the first element is the sample tensor.

Source code in mindone/diffusers/models/transformers/transformer_bria.py
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
def construct(
    self,
    hidden_states: ms.Tensor,
    encoder_hidden_states: ms.Tensor = None,
    pooled_projections: ms.Tensor = None,
    timestep: ms.Tensor = None,
    img_ids: ms.Tensor = None,
    txt_ids: ms.Tensor = None,
    guidance: ms.Tensor = None,
    attention_kwargs: Optional[Dict[str, Any]] = None,
    return_dict: bool = True,
    controlnet_block_samples=None,
    controlnet_single_block_samples=None,
) -> Union[Tuple[ms.Tensor], Transformer2DModelOutput]:
    """
    The [`BriaTransformer2DModel`] forward method.

    Args:
        hidden_states (`ms.Tensor`):
            Input `hidden_states`. Passed through `self.x_embedder` first (presumably packed latents of shape
            `(batch size, image_sequence_len, in_channels)` -- TODO confirm against the pipeline).
        encoder_hidden_states (`ms.Tensor` of shape `(batch size, sequence_len, embed_dims)`):
            Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
        pooled_projections (`ms.Tensor` of shape `(batch_size, projection_dim)`): Embeddings projected
            from the embeddings of input conditions. Accepted for interface compatibility; not read by this
            method.
        timestep ( `ms.Tensor`):
            Used to indicate denoising step. Required in practice: it is cast and embedded unconditionally.
        img_ids (`ms.Tensor`):
            Position ids of the image tokens. A 3-D tensor is reduced to its first entry along dim 0.
        txt_ids (`ms.Tensor`):
            Position ids of the text tokens. A 3-D tensor is reduced to its first entry along dim 0. They are
            concatenated in front of `img_ids` before the rotary embedding is computed.
        guidance (`ms.Tensor`, *optional*):
            Guidance values. When given, their embedding (via `self.guidance_embed`) is added to the timestep
            embedding.
        attention_kwargs (`dict`, *optional*):
            Only inspected for a `"scale"` entry, which is rejected (see Raises). It is not forwarded to the
            transformer blocks by this method.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
            tuple.
        controlnet_block_samples (`list` of `ms.Tensor`, *optional*):
            A list of tensors that if specified are added to the residuals of the transformer blocks. Each
            sample is shared by `ceil(num_blocks / num_samples)` consecutive blocks.
        controlnet_single_block_samples (`list` of `ms.Tensor`, *optional*):
            Same as `controlnet_block_samples`, applied after the single transformer blocks.

    Returns:
        If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
        `tuple` where the first element is the sample tensor.

    Raises:
        RuntimeError: If `attention_kwargs` contains a `"scale"` entry.
    """
    if attention_kwargs is not None and "scale" in attention_kwargs:
        # weight the lora layers by setting `lora_scale` for each PEFT layer here
        # and remove `lora_scale` from each PEFT layer at the end.
        # scale_lora_layers & unscale_lora_layers maybe contains some operation forbidden in graph mode
        raise RuntimeError(
            f"You are trying to set scaling of lora layer by passing {attention_kwargs['scale']=}. "
            f"However it's not allowed in on-the-fly model forwarding. "
            f"Please manually call `scale_lora_layers(model, lora_scale)` before model forwarding and "
            f"`unscale_lora_layers(model, lora_scale)` after model forwarding. "
            f"For example, it can be done in a pipeline call like `StableDiffusionPipeline.__call__`."
        )
    hidden_states = self.x_embedder(hidden_states)

    timestep = timestep.to(hidden_states.dtype)
    temb = self.time_embed(timestep, dtype=hidden_states.dtype)

    # Test for presence, not truthiness: a zero-valued guidance tensor is still a valid input, and the
    # truth value of a multi-element tensor is not well defined.
    if guidance is not None:
        guidance = guidance.to(hidden_states.dtype)
        temb += self.guidance_embed(guidance, dtype=hidden_states.dtype)

    encoder_hidden_states = self.context_embedder(encoder_hidden_states)

    # Batched (3-D) id tensors are reduced to their first entry.
    if len(txt_ids.shape) == 3:
        txt_ids = txt_ids[0]

    if len(img_ids.shape) == 3:
        img_ids = img_ids[0]

    # Text ids come first, then image ids.
    ids = mint.cat((txt_ids, img_ids), dim=0)
    image_rotary_emb = self.pos_embed(ids)

    # Number of consecutive blocks sharing one controlnet sample; loop-invariant, so computed once.
    block_interval = 1
    if controlnet_block_samples is not None:
        block_interval = int(np.ceil(len(self.transformer_blocks) / len(controlnet_block_samples)))

    for index_block, block in enumerate(self.transformer_blocks):
        encoder_hidden_states, hidden_states = block(
            hidden_states=hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            temb=temb,
            image_rotary_emb=image_rotary_emb,
        )

        # controlnet residual
        if controlnet_block_samples is not None:
            hidden_states = hidden_states + controlnet_block_samples[index_block // block_interval]

    single_interval = 1
    if controlnet_single_block_samples is not None:
        single_interval = int(
            np.ceil(len(self.single_transformer_blocks) / len(controlnet_single_block_samples))
        )

    for index_block, block in enumerate(self.single_transformer_blocks):
        encoder_hidden_states, hidden_states = block(
            hidden_states=hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            temb=temb,
            image_rotary_emb=image_rotary_emb,
        )

        # controlnet residual
        # NOTE(review): the residual is added only to positions after the first
        # `encoder_hidden_states.shape[1]` tokens, which presumes `hidden_states` still carries the text
        # tokens in front of the image tokens -- confirm against the single block's return value.
        if controlnet_single_block_samples is not None:
            hidden_states[:, encoder_hidden_states.shape[1] :, ...] = (
                hidden_states[:, encoder_hidden_states.shape[1] :, ...]
                + controlnet_single_block_samples[index_block // single_interval]
            )

    hidden_states = self.norm_out(hidden_states, temb)
    output = self.proj_out(hidden_states)

    if not return_dict:
        return (output,)

    return Transformer2DModelOutput(sample=output)