StableAudioDiTModel¶

A Transformer model for audio waveforms from Stable Audio Open.

`mindone.diffusers.StableAudioDiTModel` ¶

Bases: ModelMixin, ConfigMixin

The Diffusion Transformer model introduced in Stable Audio.

Reference: https://github.com/Stability-AI/stable-audio-tools

PARAMETER	DESCRIPTION
`sample_size`	The size of the input sample. TYPE: `int`, optional, defaults to 1024 DEFAULT: `1024`
`in_channels`	The number of channels in the input. TYPE: `int`, optional, defaults to 64 DEFAULT: `64`
`num_layers`	The number of layers of Transformer blocks to use. TYPE: `int`, optional, defaults to 24 DEFAULT: `24`
`attention_head_dim`	The number of channels in each head. TYPE: `int`, optional, defaults to 64 DEFAULT: `64`
`num_attention_heads`	The number of heads to use for the query states. TYPE: `int`, optional, defaults to 24 DEFAULT: `24`
`num_key_value_attention_heads`	The number of heads to use for the key and value states. TYPE: `int`, optional, defaults to 12 DEFAULT: `12`
`out_channels`	Number of output channels. TYPE: `int`, defaults to 64 DEFAULT: `64`
`cross_attention_dim`	Dimension of the cross-attention projection. TYPE: `int`, optional, defaults to 768 DEFAULT: `768`
`time_proj_dim`	Dimension of the timestep inner projection. TYPE: `int`, optional, defaults to 256 DEFAULT: `256`
`global_states_input_dim`	Input dimension of the global hidden states projection. TYPE: `int`, optional, defaults to 1536 DEFAULT: `1536`
`cross_attention_input_dim`	Input dimension of the cross-attention projection. TYPE: `int`, optional, defaults to 768 DEFAULT: `768`

Source code in mindone/diffusers/models/transformers/stable_audio_transformer.py

class StableAudioDiTModel(ModelMixin, ConfigMixin):
    """
    The Diffusion Transformer model introduced in Stable Audio.

    Reference: https://github.com/Stability-AI/stable-audio-tools

    Parameters:
        sample_size (`int`, *optional*, defaults to 1024): The size of the input sample.
        in_channels (`int`, *optional*, defaults to 64): The number of channels in the input.
        num_layers (`int`, *optional*, defaults to 24): The number of layers of Transformer blocks to use.
        attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
        num_attention_heads (`int`, *optional*, defaults to 24): The number of heads to use for the query states.
        num_key_value_attention_heads (`int`, *optional*, defaults to 12):
            The number of heads to use for the key and value states.
        out_channels (`int`, *optional*, defaults to 64): Number of output channels.
        cross_attention_dim (`int`, *optional*, defaults to 768): Dimension of the cross-attention projection.
        time_proj_dim (`int`, *optional*, defaults to 256): Dimension of the timestep inner projection.
        global_states_input_dim (`int`, *optional*, defaults to 1536):
            Input dimension of the global hidden states projection.
        cross_attention_input_dim (`int`, *optional*, defaults to 768):
            Input dimension of the cross-attention projection.
    """

    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(
        self,
        sample_size: int = 1024,
        in_channels: int = 64,
        num_layers: int = 24,
        attention_head_dim: int = 64,
        num_attention_heads: int = 24,
        num_key_value_attention_heads: int = 12,
        out_channels: int = 64,
        cross_attention_dim: int = 768,
        time_proj_dim: int = 256,
        global_states_input_dim: int = 1536,
        cross_attention_input_dim: int = 768,
    ):
        super().__init__()
        self.sample_size = sample_size
        self.out_channels = out_channels
        # Width of the transformer token stream: one `attention_head_dim` slice per query head.
        self.inner_dim = num_attention_heads * attention_head_dim

        # Fourier features of the timestep. The embedding size is half of `time_proj_dim`;
        # presumably the sin/cos halves together give `time_proj_dim` features, which is
        # what the first Dense of `timestep_proj` expects -- TODO confirm.
        self.time_proj = StableAudioGaussianFourierProjection(
            embedding_size=time_proj_dim // 2,
            flip_sin_to_cos=True,
            log=False,
            set_W_to_weight=False,
        )

        # MLP lifting the timestep features to `inner_dim`.
        self.timestep_proj = nn.SequentialCell(
            nn.Dense(time_proj_dim, self.inner_dim, has_bias=True),
            nn.SiLU(),
            nn.Dense(self.inner_dim, self.inner_dim, has_bias=True),
        )

        # MLP lifting the global conditioning states to `inner_dim`.
        self.global_proj = nn.SequentialCell(
            nn.Dense(global_states_input_dim, self.inner_dim, has_bias=False),
            nn.SiLU(),
            nn.Dense(self.inner_dim, self.inner_dim, has_bias=False),
        )

        # MLP mapping the encoder states to the cross-attention width.
        self.cross_attention_proj = nn.SequentialCell(
            nn.Dense(cross_attention_input_dim, cross_attention_dim, has_bias=False),
            nn.SiLU(),
            nn.Dense(cross_attention_dim, cross_attention_dim, has_bias=False),
        )

        # Kernel-size-1 convolution used as a residual branch on the raw input (see `construct`).
        self.preprocess_conv = nn.Conv1d(in_channels, in_channels, 1, has_bias=False)
        self.proj_in = nn.Dense(in_channels, self.inner_dim, has_bias=False)

        self.transformer_blocks = nn.CellList(
            [
                StableAudioDiTBlock(
                    dim=self.inner_dim,
                    num_attention_heads=num_attention_heads,
                    num_key_value_attention_heads=num_key_value_attention_heads,
                    attention_head_dim=attention_head_dim,
                    cross_attention_dim=cross_attention_dim,
                )
                for _ in range(num_layers)
            ]
        )

        self.proj_out = nn.Dense(self.inner_dim, self.out_channels, has_bias=False)
        # Kernel-size-1 convolution used as a residual branch on the output (see `construct`).
        self.postprocess_conv = nn.Conv1d(self.out_channels, self.out_channels, 1, has_bias=False)

        self.gradient_checkpointing = False

    @property
    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
    def attn_processors(self) -> Dict[str, AttentionProcessor]:
        r"""
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model,
            indexed by weight name (`"<path.to.module>.processor"`).
        """
        # Filled in place by the recursive walk below.
        processors = {}

        def fn_recursive_add_processors(name: str, module: nn.Cell, processors: Dict[str, AttentionProcessor]):
            # Any cell exposing `get_processor` is treated as an attention layer.
            if hasattr(module, "get_processor"):
                processors[f"{name}.processor"] = module.get_processor()

            for sub_name, child in module.named_children():
                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)

            return processors

        for name, module in self.named_children():
            fn_recursive_add_processors(name, module, processors)

        return processors

    # Adapted from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
        r"""
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors. The dict is
                only read, never modified.

        Raises:
            ValueError: If `processor` is a dict whose length differs from the number of attention layers.
            KeyError: If `processor` is a dict that lacks the `"<path>.processor"` key of an attention layer.
        """
        count = len(self.attn_processors)

        if isinstance(processor, dict) and len(processor) != count:
            raise ValueError(
                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
            )

        def fn_recursive_attn_processor(name: str, module: nn.Cell, processor):
            if hasattr(module, "set_processor"):
                if not isinstance(processor, dict):
                    module.set_processor(processor)
                else:
                    # Index instead of `pop` so the caller's dict is left intact and can be reused.
                    module.set_processor(processor[f"{name}.processor"])

            for sub_name, child in module.named_children():
                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)

        for name, module in self.named_children():
            fn_recursive_attn_processor(name, module, processor)

    # Copied from diffusers.models.transformers.hunyuan_transformer_2d.HunyuanDiT2DModel.set_default_attn_processor with Hunyuan->StableAudio
    def set_default_attn_processor(self):
        """
        Disables custom attention processors and sets the default attention implementation.
        """
        self.set_attn_processor(StableAudioAttnProcessor2_0())

    def _set_gradient_checkpointing(self, module, value=False):
        # Only touch modules that expose the flag.
        if hasattr(module, "gradient_checkpointing"):
            module.gradient_checkpointing = value

    def construct(
        self,
        hidden_states: ms.Tensor,
        timestep: ms.Tensor = None,
        encoder_hidden_states: ms.Tensor = None,
        global_hidden_states: ms.Tensor = None,
        rotary_embedding: ms.Tensor = None,
        return_dict: bool = True,
        attention_mask: Optional[ms.Tensor] = None,
        encoder_attention_mask: Optional[ms.Tensor] = None,
    ) -> Union[ms.Tensor, Transformer2DModelOutput]:
        """
        The [`StableAudioDiTModel`] forward method.

        Args:
            hidden_states (`ms.Tensor` of shape `(batch size, in_channels, sequence_len)`):
                Input `hidden_states`.
            timestep (`ms.Tensor`):
                Used to indicate denoising step.
            encoder_hidden_states (`ms.Tensor` of shape `(batch size, encoder_sequence_len, cross_attention_input_dim)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
            global_hidden_states (`ms.Tensor` of shape `(batch size, global_sequence_len, global_states_input_dim)`):
                Global embeddings that will be prepended to the hidden states.
            rotary_embedding (`ms.Tensor`):
                The rotary embeddings to apply on query and key tensors during attention calculation.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
                tuple.
            attention_mask (`ms.Tensor` of shape `(batch_size, sequence_len)`, *optional*):
                Mask to avoid performing attention on padding token indices, formed by concatenating the attention
                masks for the two text encoders together. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
            encoder_attention_mask (`ms.Tensor` of shape `(batch_size, sequence_len)`, *optional*):
                Mask to avoid performing attention on padding token cross-attention indices, formed by concatenating
                the attention masks for the two text encoders together. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

        Returns:
            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
            `tuple` where the first element is the sample tensor.
        """
        cross_attention_hidden_states = self.cross_attention_proj(encoder_hidden_states)
        global_hidden_states = self.global_proj(global_hidden_states)
        # The timestep is cast to the model dtype before being embedded.
        time_hidden_states = self.timestep_proj(self.time_proj(timestep.to(self.dtype)))

        # Add the timestep embedding to the global states (broadcast over axis 1).
        global_hidden_states = global_hidden_states + time_hidden_states.unsqueeze(1)

        # Residual connection around the input convolution.
        hidden_states = self.preprocess_conv(hidden_states) + hidden_states
        # (batch_size, dim, sequence_length) -> (batch_size, sequence_length, dim)

        hidden_states = hidden_states.transpose(0, 2, 1)
        hidden_states = self.proj_in(hidden_states)

        # prepend global states to hidden states
        hidden_states = ops.cat([global_hidden_states, hidden_states], axis=-2)
        if attention_mask is not None:
            # NOTE(review): the mask is extended by a single always-attended position and only one
            # position is stripped after the blocks, so this appears to assume
            # global_sequence_len == 1 -- confirm.
            prepend_mask = ops.ones((hidden_states.shape[0], 1), dtype=ms.bool)
            attention_mask = ops.cat([prepend_mask, attention_mask], axis=-1)

        for block in self.transformer_blocks:
            # TODO: add recompute (`self.gradient_checkpointing` is not consulted here yet)
            hidden_states = block(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                encoder_hidden_states=cross_attention_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                rotary_embedding=rotary_embedding,
            )

        hidden_states = self.proj_out(hidden_states)

        # (batch_size, sequence_length, dim) -> (batch_size, dim, sequence_length)
        # remove prepend length that has been added by global hidden states
        hidden_states = hidden_states.transpose(0, 2, 1)[:, :, 1:]
        # Residual connection around the output convolution.
        hidden_states = self.postprocess_conv(hidden_states) + hidden_states

        if not return_dict:
            return (hidden_states,)

        return Transformer2DModelOutput(sample=hidden_states)

`mindone.diffusers.StableAudioDiTModel.attn_processors: Dict[str, AttentionProcessor]` `property` ¶

RETURNS	DESCRIPTION
`Dict[str, AttentionProcessor]`	`dict` of attention processors: A dictionary containing all attention processors used in the model, indexed by weight name.

`mindone.diffusers.StableAudioDiTModel.construct(hidden_states, timestep=None, encoder_hidden_states=None, global_hidden_states=None, rotary_embedding=None, return_dict=True, attention_mask=None, encoder_attention_mask=None)` ¶

The [StableAudioDiTModel] forward method.

PARAMETER	DESCRIPTION
`hidden_states`	Input `hidden_states`. TYPE: `ms.Tensor` of shape `(batch size, in_channels, sequence_len)`
`timestep`	Used to indicate denoising step. TYPE: `ms.Tensor` DEFAULT: `None`
`encoder_hidden_states`	Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. TYPE: `ms.Tensor` of shape `(batch size, encoder_sequence_len, cross_attention_input_dim)` DEFAULT: `None`
`global_hidden_states`	Global embeddings that will be prepended to the hidden states. TYPE: `ms.Tensor` of shape `(batch size, global_sequence_len, global_states_input_dim)` DEFAULT: `None`
`rotary_embedding`	The rotary embeddings to apply on query and key tensors during attention calculation. TYPE: `ms.Tensor` DEFAULT: `None`
`return_dict`	Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain tuple. TYPE: `bool`, optional, defaults to `True` DEFAULT: `True`
`attention_mask`	Mask to avoid performing attention on padding token indices, formed by concatenating the attention masks for the two text encoders together. Mask values selected in `[0, 1]`: 1 for tokens that are not masked, 0 for tokens that are masked. TYPE: `ms.Tensor` of shape `(batch_size, sequence_len)`, optional DEFAULT: `None`
`encoder_attention_mask`	Mask to avoid performing attention on padding token cross-attention indices, formed by concatenating the attention masks for the two text encoders together. Mask values selected in `[0, 1]`: 1 for tokens that are not masked, 0 for tokens that are masked. TYPE: `ms.Tensor` of shape `(batch_size, sequence_len)`, optional DEFAULT: `None`

Source code in mindone/diffusers/models/transformers/stable_audio_transformer.py

def construct(
    self,
    hidden_states: ms.Tensor,
    timestep: ms.Tensor = None,
    encoder_hidden_states: ms.Tensor = None,
    global_hidden_states: ms.Tensor = None,
    rotary_embedding: ms.Tensor = None,
    return_dict: bool = True,
    attention_mask: Optional[ms.Tensor] = None,
    encoder_attention_mask: Optional[ms.Tensor] = None,
) -> Union[ms.Tensor, Transformer2DModelOutput]:
    """
    Run the [`StableAudioDiTModel`] forward pass.

    Args:
        hidden_states (`ms.Tensor` of shape `(batch size, in_channels, sequence_len)`):
            The input sample.
        timestep (`ms.Tensor`):
            The denoising step; cast to the model dtype before it is embedded.
        encoder_hidden_states (`ms.Tensor` of shape `(batch size, encoder_sequence_len, cross_attention_input_dim)`):
            Conditional embeddings (computed from input conditions such as prompts), consumed by cross-attention.
        global_hidden_states (`ms.Tensor` of shape `(batch size, global_sequence_len, global_states_input_dim)`):
            Global embeddings that are prepended to the hidden states.
        rotary_embedding (`ms.Tensor`):
            Rotary embeddings handed to every transformer block for the query/key tensors.
        return_dict (`bool`, *optional*, defaults to `True`):
            Return a [`~models.transformer_2d.Transformer2DModelOutput`] when `True`, a plain tuple otherwise.
        attention_mask (`ms.Tensor` of shape `(batch_size, sequence_len)`, *optional*):
            Mask to avoid performing attention on padding token indices, formed by concatenating the attention
            masks for the two text encoders together. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        encoder_attention_mask (`ms.Tensor` of shape `(batch_size, sequence_len)`, *optional*):
            Mask to avoid performing attention on padding token cross-attention indices, formed by concatenating
            the attention masks for the two text encoders together. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

    Returns:
        A [`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple` whose
        first element is the sample tensor.
    """
    # Project the conditioning inputs.
    context = self.cross_attention_proj(encoder_hidden_states)
    global_tokens = self.global_proj(global_hidden_states)

    # Embed the timestep and add it to the global tokens.
    timestep_features = self.time_proj(timestep.to(self.dtype))
    timestep_embedding = self.timestep_proj(timestep_features)
    global_tokens = global_tokens + timestep_embedding.unsqueeze(1)

    # Residual input convolution, then
    # (batch_size, dim, sequence_length) -> (batch_size, sequence_length, dim) and projection.
    hidden_states = self.preprocess_conv(hidden_states) + hidden_states
    hidden_states = self.proj_in(hidden_states.transpose(0, 2, 1))

    # Put the global tokens in front of the sequence; extend the mask by one position to match.
    hidden_states = ops.cat([global_tokens, hidden_states], axis=-2)
    if attention_mask is not None:
        batch_size = hidden_states.shape[0]
        leading_mask = ops.ones((batch_size, 1), dtype=ms.bool)
        attention_mask = ops.cat([leading_mask, attention_mask], axis=-1)

    for layer in self.transformer_blocks:
        # todo:add recompute
        hidden_states = layer(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            encoder_hidden_states=context,
            encoder_attention_mask=encoder_attention_mask,
            rotary_embedding=rotary_embedding,
        )

    projected = self.proj_out(hidden_states)

    # (batch_size, sequence_length, dim) -> (batch_size, dim, sequence_length),
    # dropping the leading position that was prepended above.
    trimmed = projected.transpose(0, 2, 1)[:, :, 1:]
    sample = self.postprocess_conv(trimmed) + trimmed

    if return_dict:
        return Transformer2DModelOutput(sample=sample)

    return (sample,)

`mindone.diffusers.StableAudioDiTModel.set_attn_processor(processor)` ¶

Sets the attention processor to use to compute attention.

PARAMETER	DESCRIPTION
`processor`	The instantiated processor class or a dictionary of processor classes that will be set as the processor for all `Attention` layers. If `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors. TYPE: `dict` of `AttentionProcessor` or only `AttentionProcessor`

Source code in mindone/diffusers/models/transformers/stable_audio_transformer.py

def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
    r"""
    Sets the attention processor to use to compute attention.

    Parameters:
        processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
            The instantiated processor class or a dictionary of processor classes that will be set as the processor
            for **all** `Attention` layers.

            If `processor` is a dict, the key needs to define the path to the corresponding cross attention
            processor. This is strongly recommended when setting trainable attention processors. The dict is
            only read, never modified.

    Raises:
        ValueError: If `processor` is a dict whose length differs from the number of attention layers.
        KeyError: If `processor` is a dict that lacks the `"<path>.processor"` key of an attention layer.
    """
    count = len(self.attn_processors)

    if isinstance(processor, dict) and len(processor) != count:
        raise ValueError(
            f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
            f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
        )

    def fn_recursive_attn_processor(name: str, module: nn.Cell, processor):
        # Any cell exposing `set_processor` is treated as an attention layer.
        if hasattr(module, "set_processor"):
            if not isinstance(processor, dict):
                module.set_processor(processor)
            else:
                # Index instead of `pop` so the caller's dict is left intact and can be reused.
                module.set_processor(processor[f"{name}.processor"])

        for sub_name, child in module.named_children():
            fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)

    for name, module in self.named_children():
        fn_recursive_attn_processor(name, module, processor)

`mindone.diffusers.StableAudioDiTModel.set_default_attn_processor()` ¶

Disables custom attention processors and sets the default attention implementation.

Source code in mindone/diffusers/models/transformers/stable_audio_transformer.py

def set_default_attn_processor(self):
    """
    Replace any custom attention processors with a fresh `StableAudioAttnProcessor2_0`,
    i.e. switch back to the default attention implementation.
    """
    default_processor = StableAudioAttnProcessor2_0()
    self.set_attn_processor(default_processor)

StableAudioDiTModel¶

mindone.diffusers.StableAudioDiTModel ¶

mindone.diffusers.StableAudioDiTModel.attn_processors: Dict[str, AttentionProcessor] property ¶

mindone.diffusers.StableAudioDiTModel.construct(hidden_states, timestep=None, encoder_hidden_states=None, global_hidden_states=None, rotary_embedding=None, return_dict=True, attention_mask=None, encoder_attention_mask=None) ¶

mindone.diffusers.StableAudioDiTModel.set_attn_processor(processor) ¶

mindone.diffusers.StableAudioDiTModel.set_default_attn_processor() ¶

`mindone.diffusers.StableAudioDiTModel` ¶

`mindone.diffusers.StableAudioDiTModel.attn_processors: Dict[str, AttentionProcessor]` `property` ¶

`mindone.diffusers.StableAudioDiTModel.construct(hidden_states, timestep=None, encoder_hidden_states=None, global_hidden_states=None, rotary_embedding=None, return_dict=True, attention_mask=None, encoder_attention_mask=None)` ¶

`mindone.diffusers.StableAudioDiTModel.set_attn_processor(processor)` ¶

`mindone.diffusers.StableAudioDiTModel.set_default_attn_processor()` ¶