IP-Adapter¶

IP-Adapter is a lightweight adapter that enables prompting a diffusion model with an image. This method decouples the cross-attention layers of the image and text features. The image features are generated from an image encoder.

Tip

Learn how to load an IP-Adapter checkpoint and image in the IP-Adapter loading guide, and see how to use IP-Adapter in the usage guide.

`mindone.diffusers.loaders.ip_adapter.IPAdapterMixin` ¶

Mixin for handling IP Adapters.

Source code in mindone/diffusers/loaders/ip_adapter.py

class IPAdapterMixin:
    """
    Mixin for handling IP Adapters.

    Adds `load_ip_adapter`, `set_ip_adapter_scale` and `unload_ip_adapter` to a pipeline. The pipeline's UNet is
    looked up as `self.unet` when that attribute exists and through `self.unet_name` otherwise.
    """

    @validate_hf_hub_args
    def load_ip_adapter(
        self,
        pretrained_model_name_or_path_or_dict: Union[str, List[str], Dict[str, mindspore.Tensor]],
        subfolder: Union[str, List[str]],
        weight_name: Union[str, List[str]],
        image_encoder_folder: Optional[str] = "image_encoder",
        **kwargs,
    ):
        """
        Load one or more IP-Adapter checkpoints into the pipeline's UNet.

        When the pipeline exposes an `image_encoder` / `feature_extractor` attribute that is still `None`, the
        missing component is created and registered as part of loading.

        Parameters:
            pretrained_model_name_or_path_or_dict (`str` or `List[str]` or `os.PathLike` or `List[os.PathLike]` or `dict` or `List[dict]`):
                Can be either:

                    - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
                      the Hub.
                    - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
                      with [`ModelMixin.save_pretrained`].
                    - A MindSpore state dict with the top-level keys `image_proj` and `ip_adapter`.

                A single value is reused for every entry of `weight_name`.
            subfolder (`str` or `List[str]`):
                The subfolder location of a model file within a larger model repository on the Hub or locally. If a
                list is passed, it should have the same length as `weight_name`. A single value is reused for every
                entry of `weight_name`.
            weight_name (`str` or `List[str]`):
                The name of the weight file to load. If a list is passed, it should have the same length as
                `subfolder`. Only `.safetensors` files are supported.
            image_encoder_folder (`str`, *optional*, defaults to `image_encoder`):
                The subfolder location of the image encoder within a larger model repository on the Hub or locally.
                Pass `None` to not load the image encoder. If the image encoder is located in a folder inside
                `subfolder`, you only need to pass the name of the folder that contains image encoder weights, e.g.
                `image_encoder_folder="image_encoder"`. If the image encoder is located in a folder other than
                `subfolder`, you should pass the path to the folder that contains image encoder weights, for example,
                `image_encoder_folder="different_subfolder/image_encoder"`.
            cache_dir (`Union[str, os.PathLike]`, *optional*):
                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
                is not used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                cached versions if they exist.

            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            local_files_only (`bool`, *optional*, defaults to `False`):
                Whether to only load local model weights and configuration files or not. If set to `True`, the model
                won't be downloaded from the Hub.
            token (`str` or *bool*, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
                `diffusers-cli login` (stored in `~/.huggingface`) is used.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
                allowed by Git.

        Raises:
            ValueError: If the list arguments have different lengths, if a state dict lacks the `image_proj` or
                `ip_adapter` key, or if the image encoder has to be loaded but the source is a state dict.
            NotImplementedError: If a weight file is not in safetensors format.
        """

        # handle the list inputs for multiple IP Adapters
        if not isinstance(weight_name, list):
            weight_name = [weight_name]

        if not isinstance(pretrained_model_name_or_path_or_dict, list):
            pretrained_model_name_or_path_or_dict = [pretrained_model_name_or_path_or_dict]
        if len(pretrained_model_name_or_path_or_dict) == 1:
            # a single source is shared by every weight file
            pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict * len(weight_name)

        if not isinstance(subfolder, list):
            subfolder = [subfolder]
        if len(subfolder) == 1:
            # a single subfolder is shared by every weight file
            subfolder = subfolder * len(weight_name)

        if len(weight_name) != len(pretrained_model_name_or_path_or_dict):
            raise ValueError("`weight_name` and `pretrained_model_name_or_path_or_dict` must have the same length.")

        if len(weight_name) != len(subfolder):
            raise ValueError("`weight_name` and `subfolder` must have the same length.")

        # Load the main state dict first.
        cache_dir = kwargs.pop("cache_dir", None)
        force_download = kwargs.pop("force_download", False)
        proxies = kwargs.pop("proxies", None)
        local_files_only = kwargs.pop("local_files_only", None)
        token = kwargs.pop("token", None)
        revision = kwargs.pop("revision", None)

        state_dicts = []
        # Distinct loop names keep the normalized argument lists intact instead of rebinding the parameters.
        for model_source, adapter_weight_name, adapter_subfolder in zip(
            pretrained_model_name_or_path_or_dict, weight_name, subfolder
        ):
            source_is_state_dict = isinstance(model_source, dict)

            if source_is_state_dict:
                state_dict = model_source
            else:
                model_file = _get_model_file(
                    model_source,
                    weights_name=adapter_weight_name,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    local_files_only=local_files_only,
                    token=token,
                    revision=revision,
                    subfolder=adapter_subfolder,
                )
                if not adapter_weight_name.endswith(".safetensors"):
                    raise NotImplementedError(
                        f"Only supports deserialization of weights file in safetensors format, but got {model_file}"
                    )
                # Split the flat checkpoint into the two sub-dicts, dropping the group prefix from each key.
                state_dict = {"image_proj": {}, "ip_adapter": {}}
                for key, value in load_file(model_file).items():
                    if key.startswith("image_proj."):
                        state_dict["image_proj"][key.replace("image_proj.", "")] = value
                    elif key.startswith("ip_adapter."):
                        state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = value

            # Both groups are required, so reject a state dict that is missing either one.
            if "image_proj" not in state_dict or "ip_adapter" not in state_dict:
                raise ValueError("Required keys are (`image_proj` and `ip_adapter`) missing from the state dict.")

            state_dicts.append(state_dict)

            # load CLIP image encoder here if it has not been registered to the pipeline yet
            if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is None:
                if image_encoder_folder is None:
                    logger.warning(
                        "image_encoder is not loaded since `image_encoder_folder=None` passed. "
                        "You will not be able to use `ip_adapter_image` when calling the pipeline with IP-Adapter. "
                        "Use `ip_adapter_image_embeds` to pass pre-generated image embedding instead."
                    )
                elif source_is_state_dict:
                    raise ValueError(
                        "`image_encoder` cannot be loaded because `pretrained_model_name_or_path_or_dict` is a state dict."
                    )
                else:
                    logger.info(f"loading image_encoder from {model_source}")
                    # A bare folder name is resolved inside the adapter's subfolder; anything containing a
                    # "/" is used as the subfolder path as given.
                    if image_encoder_folder.count("/") == 0:
                        image_encoder_subfolder = Path(adapter_subfolder, image_encoder_folder).as_posix()
                    else:
                        image_encoder_subfolder = Path(image_encoder_folder).as_posix()

                    image_encoder = CLIPVisionModelWithProjection.from_pretrained(
                        model_source,
                        subfolder=image_encoder_subfolder,
                        cache_dir=cache_dir,
                        local_files_only=local_files_only,
                        mindspore_dtype=self.dtype,
                    )
                    self.register_modules(image_encoder=image_encoder)

            # create feature extractor if it has not been registered to the pipeline yet
            if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None:
                # FaceID IP adapters don't need the image encoder so it's not present, in this case we default to 224
                default_clip_size = 224
                clip_image_size = (
                    self.image_encoder.config.image_size if self.image_encoder is not None else default_clip_size
                )
                feature_extractor = CLIPImageProcessor(size=clip_image_size, crop_size=clip_image_size)
                self.register_modules(feature_extractor=feature_extractor)

        # load ip-adapter into unet
        unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
        unet._load_ip_adapter_weights(state_dicts)

        extra_loras = unet._load_ip_adapter_loras(state_dicts)
        if extra_loras:
            # apply the IP Adapter Face ID LoRA weights
            peft_config = getattr(unet, "peft_config", {})
            for k, lora in extra_loras.items():
                adapter_name = f"faceid_{k}"
                # skip LoRAs that are already registered under the same adapter name
                if adapter_name not in peft_config:
                    self.load_lora_weights(lora, adapter_name=adapter_name)
                    self.set_adapters([adapter_name], adapter_weights=[1.0])

    def set_ip_adapter_scale(self, scale):
        """
        Set IP-Adapter scales per-transformer block. Input `scale` could be a single config or a list of configs for
        granular control over each IP-Adapter behavior. A config can be a float or a dictionary.

        A single config is applied to every loaded IP-Adapter. A list of several configs must contain exactly one
        config per loaded IP-Adapter.

        Example:

        ```py
        # To use original IP-Adapter
        scale = 1.0
        pipeline.set_ip_adapter_scale(scale)

        # To use style block only
        scale = {
            "up": {"block_0": [0.0, 1.0, 0.0]},
        }
        pipeline.set_ip_adapter_scale(scale)

        # To use style+layout blocks
        scale = {
            "down": {"block_2": [0.0, 1.0]},
            "up": {"block_0": [0.0, 1.0, 0.0]},
        }
        pipeline.set_ip_adapter_scale(scale)

        # To use style and layout from 2 reference images
        scales = [{"down": {"block_2": [0.0, 1.0]}}, {"up": {"block_0": [0.0, 1.0, 0.0]}}]
        pipeline.set_ip_adapter_scale(scales)
        ```

        Raises:
            ValueError: If the number of configs is neither one nor the number of loaded IP-Adapters.
        """
        unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
        if not isinstance(scale, list):
            scale = [scale]
        scale_configs = _maybe_expand_lora_scales(unet, scale, default_scale=0.0)

        ip_adapter_processor_types = (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)
        for attn_name, attn_processor in unet.attn_processors.items():
            if not isinstance(attn_processor, ip_adapter_processor_types):
                continue

            num_adapters = len(attn_processor.scale)
            configs = scale_configs
            # Broadcast before validating the length; checking first would make the broadcast unreachable.
            if len(configs) == 1 and num_adapters > 1:
                configs = configs * num_adapters
            if len(configs) != num_adapters:
                raise ValueError(
                    f"Cannot assign {len(scale_configs)} scale_configs to {num_adapters} IP-Adapter."
                )

            for i, scale_config in enumerate(configs):
                if isinstance(scale_config, dict):
                    # dict configs map an attention-processor name prefix to the scale for that block
                    for k, s in scale_config.items():
                        if attn_name.startswith(k):
                            attn_processor.scale[i] = s
                else:
                    attn_processor.scale[i] = scale_config

    def unload_ip_adapter(self):
        """
        Unloads the IP Adapter weights

        Clears the image encoder, clears the feature extractor (only for pipelines without a `safety_checker`
        attribute), resets the UNet's hidden-state projection and replaces every IP-Adapter attention processor
        with a fresh `AttnProcessor2_0`.

        Examples:

        ```python
        >>> # Assuming `pipeline` is already loaded with the IP Adapter weights.
        >>> pipeline.unload_ip_adapter()
        >>> ...
        ```
        """
        # Resolve the UNet the same way as `load_ip_adapter` and `set_ip_adapter_scale` do.
        unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet

        # remove CLIP image encoder
        if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is not None:
            self.image_encoder = None
            self.register_to_config(image_encoder=[None, None])

        # Remove the feature extractor only on pipelines that have no `safety_checker` attribute at all, because
        # the safety checker uses the feature extractor later.
        if not hasattr(self, "safety_checker"):
            if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is not None:
                self.feature_extractor = None
                self.register_to_config(feature_extractor=[None, None])

        # remove hidden encoder
        unet.encoder_hid_proj = None
        unet.config.encoder_hid_dim_type = None

        # Kolors: restore `encoder_hid_proj` with `text_encoder_hid_proj`
        if hasattr(unet, "text_encoder_hid_proj") and unet.text_encoder_hid_proj is not None:
            unet.encoder_hid_proj = unet.text_encoder_hid_proj
            unet.text_encoder_hid_proj = None
            unet.config.encoder_hid_dim_type = "text_proj"

        # Restore the original UNet attention processors: IP-Adapter processors become a default
        # `AttnProcessor2_0`, every other processor is re-created from its own class.
        ip_adapter_processor_types = (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)
        attn_procs = {}
        for name, processor in unet.attn_processors.items():
            if isinstance(processor, ip_adapter_processor_types):
                attn_procs[name] = AttnProcessor2_0()
            else:
                attn_procs[name] = processor.__class__()
        unet.set_attn_processor(attn_procs)

`mindone.diffusers.loaders.ip_adapter.IPAdapterMixin.load_ip_adapter(pretrained_model_name_or_path_or_dict, subfolder, weight_name, image_encoder_folder='image_encoder', **kwargs)` ¶

PARAMETER	DESCRIPTION
`pretrained_model_name_or_path_or_dict`	Can be one of: (1) a string, the model id (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on the Hub; (2) a path to a directory (for example `./my_model_directory`) containing the model weights saved with `ModelMixin.save_pretrained`; (3) a MindSpore state dict. TYPE: `str` or `List[str]` or `os.PathLike` or `List[os.PathLike]` or `dict` or `List[dict]`
`subfolder`	The subfolder location of a model file within a larger model repository on the Hub or locally. If a list is passed, it should have the same length as `weight_name`. TYPE: `str` or `List[str]`
`weight_name`	The name of the weight file to load. If a list is passed, it should have the same length as `subfolder`. TYPE: `str` or `List[str]`
`image_encoder_folder`	The subfolder location of the image encoder within a larger model repository on the Hub or locally. Pass `None` to not load the image encoder. If the image encoder is located in a folder inside `subfolder`, you only need to pass the name of the folder that contains image encoder weights, e.g. `image_encoder_folder="image_encoder"`. If the image encoder is located in a folder other than `subfolder`, you should pass the path to the folder that contains image encoder weights, for example, `image_encoder_folder="different_subfolder/image_encoder"`. TYPE: `str`, optional, defaults to `image_encoder` DEFAULT: `'image_encoder'`
`cache_dir`	Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used. TYPE: `Union[str, os.PathLike]`, optional
`force_download`	Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. TYPE: `bool`, optional, defaults to `False`
`proxies`	A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. TYPE: `Dict[str, str]`, optional
`local_files_only`	Whether to only load local model weights and configuration files or not. If set to `True`, the model won't be downloaded from the Hub. TYPE: `bool`, optional, defaults to `False`
`token`	The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from `diffusers-cli login` (stored in `~/.huggingface`) is used. TYPE: `str` or bool, optional
`revision`	The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier allowed by Git. TYPE: `str`, optional, defaults to `"main"`

Source code in mindone/diffusers/loaders/ip_adapter.py

@validate_hf_hub_args
def load_ip_adapter(
    self,
    pretrained_model_name_or_path_or_dict: Union[str, List[str], Dict[str, mindspore.Tensor]],
    subfolder: Union[str, List[str]],
    weight_name: Union[str, List[str]],
    image_encoder_folder: Optional[str] = "image_encoder",
    **kwargs,
):
    """
    Load one or more IP-Adapter checkpoints into the pipeline's UNet.

    When the pipeline exposes an `image_encoder` / `feature_extractor` attribute that is still `None`, the
    missing component is created and registered as part of loading.

    Parameters:
        pretrained_model_name_or_path_or_dict (`str` or `List[str]` or `os.PathLike` or `List[os.PathLike]` or `dict` or `List[dict]`):
            Can be either:

                - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
                  the Hub.
                - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
                  with [`ModelMixin.save_pretrained`].
                - A MindSpore state dict with the top-level keys `image_proj` and `ip_adapter`.

            A single value is reused for every entry of `weight_name`.
        subfolder (`str` or `List[str]`):
            The subfolder location of a model file within a larger model repository on the Hub or locally. If a
            list is passed, it should have the same length as `weight_name`. A single value is reused for every
            entry of `weight_name`.
        weight_name (`str` or `List[str]`):
            The name of the weight file to load. If a list is passed, it should have the same length as
            `subfolder`. Only `.safetensors` files are supported.
        image_encoder_folder (`str`, *optional*, defaults to `image_encoder`):
            The subfolder location of the image encoder within a larger model repository on the Hub or locally.
            Pass `None` to not load the image encoder. If the image encoder is located in a folder inside
            `subfolder`, you only need to pass the name of the folder that contains image encoder weights, e.g.
            `image_encoder_folder="image_encoder"`. If the image encoder is located in a folder other than
            `subfolder`, you should pass the path to the folder that contains image encoder weights, for example,
            `image_encoder_folder="different_subfolder/image_encoder"`.
        cache_dir (`Union[str, os.PathLike]`, *optional*):
            Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
            is not used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force the (re-)download of the model weights and configuration files, overriding the
            cached versions if they exist.

        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
        local_files_only (`bool`, *optional*, defaults to `False`):
            Whether to only load local model weights and configuration files or not. If set to `True`, the model
            won't be downloaded from the Hub.
        token (`str` or *bool*, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
            `diffusers-cli login` (stored in `~/.huggingface`) is used.
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
            allowed by Git.

    Raises:
        ValueError: If the list arguments have different lengths, if a state dict lacks the `image_proj` or
            `ip_adapter` key, or if the image encoder has to be loaded but the source is a state dict.
        NotImplementedError: If a weight file is not in safetensors format.
    """

    # handle the list inputs for multiple IP Adapters
    if not isinstance(weight_name, list):
        weight_name = [weight_name]

    if not isinstance(pretrained_model_name_or_path_or_dict, list):
        pretrained_model_name_or_path_or_dict = [pretrained_model_name_or_path_or_dict]
    if len(pretrained_model_name_or_path_or_dict) == 1:
        # a single source is shared by every weight file
        pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict * len(weight_name)

    if not isinstance(subfolder, list):
        subfolder = [subfolder]
    if len(subfolder) == 1:
        # a single subfolder is shared by every weight file
        subfolder = subfolder * len(weight_name)

    if len(weight_name) != len(pretrained_model_name_or_path_or_dict):
        raise ValueError("`weight_name` and `pretrained_model_name_or_path_or_dict` must have the same length.")

    if len(weight_name) != len(subfolder):
        raise ValueError("`weight_name` and `subfolder` must have the same length.")

    # Load the main state dict first.
    cache_dir = kwargs.pop("cache_dir", None)
    force_download = kwargs.pop("force_download", False)
    proxies = kwargs.pop("proxies", None)
    local_files_only = kwargs.pop("local_files_only", None)
    token = kwargs.pop("token", None)
    revision = kwargs.pop("revision", None)

    state_dicts = []
    # Distinct loop names keep the normalized argument lists intact instead of rebinding the parameters.
    for model_source, adapter_weight_name, adapter_subfolder in zip(
        pretrained_model_name_or_path_or_dict, weight_name, subfolder
    ):
        source_is_state_dict = isinstance(model_source, dict)

        if source_is_state_dict:
            state_dict = model_source
        else:
            model_file = _get_model_file(
                model_source,
                weights_name=adapter_weight_name,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                local_files_only=local_files_only,
                token=token,
                revision=revision,
                subfolder=adapter_subfolder,
            )
            if not adapter_weight_name.endswith(".safetensors"):
                raise NotImplementedError(
                    f"Only supports deserialization of weights file in safetensors format, but got {model_file}"
                )
            # Split the flat checkpoint into the two sub-dicts, dropping the group prefix from each key.
            state_dict = {"image_proj": {}, "ip_adapter": {}}
            for key, value in load_file(model_file).items():
                if key.startswith("image_proj."):
                    state_dict["image_proj"][key.replace("image_proj.", "")] = value
                elif key.startswith("ip_adapter."):
                    state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = value

        # Both groups are required, so reject a state dict that is missing either one.
        if "image_proj" not in state_dict or "ip_adapter" not in state_dict:
            raise ValueError("Required keys are (`image_proj` and `ip_adapter`) missing from the state dict.")

        state_dicts.append(state_dict)

        # load CLIP image encoder here if it has not been registered to the pipeline yet
        if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is None:
            if image_encoder_folder is None:
                logger.warning(
                    "image_encoder is not loaded since `image_encoder_folder=None` passed. "
                    "You will not be able to use `ip_adapter_image` when calling the pipeline with IP-Adapter. "
                    "Use `ip_adapter_image_embeds` to pass pre-generated image embedding instead."
                )
            elif source_is_state_dict:
                raise ValueError(
                    "`image_encoder` cannot be loaded because `pretrained_model_name_or_path_or_dict` is a state dict."
                )
            else:
                logger.info(f"loading image_encoder from {model_source}")
                # A bare folder name is resolved inside the adapter's subfolder; anything containing a
                # "/" is used as the subfolder path as given.
                if image_encoder_folder.count("/") == 0:
                    image_encoder_subfolder = Path(adapter_subfolder, image_encoder_folder).as_posix()
                else:
                    image_encoder_subfolder = Path(image_encoder_folder).as_posix()

                image_encoder = CLIPVisionModelWithProjection.from_pretrained(
                    model_source,
                    subfolder=image_encoder_subfolder,
                    cache_dir=cache_dir,
                    local_files_only=local_files_only,
                    mindspore_dtype=self.dtype,
                )
                self.register_modules(image_encoder=image_encoder)

        # create feature extractor if it has not been registered to the pipeline yet
        if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None:
            # FaceID IP adapters don't need the image encoder so it's not present, in this case we default to 224
            default_clip_size = 224
            clip_image_size = (
                self.image_encoder.config.image_size if self.image_encoder is not None else default_clip_size
            )
            feature_extractor = CLIPImageProcessor(size=clip_image_size, crop_size=clip_image_size)
            self.register_modules(feature_extractor=feature_extractor)

    # load ip-adapter into unet
    unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
    unet._load_ip_adapter_weights(state_dicts)

    extra_loras = unet._load_ip_adapter_loras(state_dicts)
    if extra_loras:
        # apply the IP Adapter Face ID LoRA weights
        peft_config = getattr(unet, "peft_config", {})
        for k, lora in extra_loras.items():
            adapter_name = f"faceid_{k}"
            # skip LoRAs that are already registered under the same adapter name
            if adapter_name not in peft_config:
                self.load_lora_weights(lora, adapter_name=adapter_name)
                self.set_adapters([adapter_name], adapter_weights=[1.0])

`mindone.diffusers.loaders.ip_adapter.IPAdapterMixin.set_ip_adapter_scale(scale)` ¶

Set IP-Adapter scales per-transformer block. Input scale could be a single config or a list of configs for granular control over each IP-Adapter behavior. A config can be a float or a dictionary.

Example:

# To use original IP-Adapter
scale = 1.0
pipeline.set_ip_adapter_scale(scale)

# To use style block only
scale = {
    "up": {"block_0": [0.0, 1.0, 0.0]},
}
pipeline.set_ip_adapter_scale(scale)

# To use style+layout blocks
scale = {
    "down": {"block_2": [0.0, 1.0]},
    "up": {"block_0": [0.0, 1.0, 0.0]},
}
pipeline.set_ip_adapter_scale(scale)

# To use style and layout from 2 reference images
scales = [{"down": {"block_2": [0.0, 1.0]}}, {"up": {"block_0": [0.0, 1.0, 0.0]}}]
pipeline.set_ip_adapter_scale(scales)

Source code in mindone/diffusers/loaders/ip_adapter.py

def set_ip_adapter_scale(self, scale):
    """
    Set IP-Adapter scales per-transformer block. Input `scale` could be a single config or a list of configs for
    granular control over each IP-Adapter behavior. A config can be a float or a dictionary.

    A single config is applied to every loaded IP-Adapter. A list of several configs must contain exactly one
    config per loaded IP-Adapter.

    Example:

    ```py
    # To use original IP-Adapter
    scale = 1.0
    pipeline.set_ip_adapter_scale(scale)

    # To use style block only
    scale = {
        "up": {"block_0": [0.0, 1.0, 0.0]},
    }
    pipeline.set_ip_adapter_scale(scale)

    # To use style+layout blocks
    scale = {
        "down": {"block_2": [0.0, 1.0]},
        "up": {"block_0": [0.0, 1.0, 0.0]},
    }
    pipeline.set_ip_adapter_scale(scale)

    # To use style and layout from 2 reference images
    scales = [{"down": {"block_2": [0.0, 1.0]}}, {"up": {"block_0": [0.0, 1.0, 0.0]}}]
    pipeline.set_ip_adapter_scale(scales)
    ```

    Raises:
        ValueError: If the number of configs is neither one nor the number of loaded IP-Adapters.
    """
    unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
    if not isinstance(scale, list):
        scale = [scale]
    scale_configs = _maybe_expand_lora_scales(unet, scale, default_scale=0.0)

    ip_adapter_processor_types = (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)
    for attn_name, attn_processor in unet.attn_processors.items():
        if not isinstance(attn_processor, ip_adapter_processor_types):
            continue

        num_adapters = len(attn_processor.scale)
        configs = scale_configs
        # Broadcast before validating the length; checking first would make the broadcast unreachable.
        if len(configs) == 1 and num_adapters > 1:
            configs = configs * num_adapters
        if len(configs) != num_adapters:
            raise ValueError(
                f"Cannot assign {len(scale_configs)} scale_configs to {num_adapters} IP-Adapter."
            )

        for i, scale_config in enumerate(configs):
            if isinstance(scale_config, dict):
                # dict configs map an attention-processor name prefix to the scale for that block
                for k, s in scale_config.items():
                    if attn_name.startswith(k):
                        attn_processor.scale[i] = s
            else:
                attn_processor.scale[i] = scale_config

`mindone.diffusers.loaders.ip_adapter.IPAdapterMixin.unload_ip_adapter()` ¶

Unloads the IP Adapter weights

Examples:

>>> # Assuming `pipeline` is already loaded with the IP Adapter weights.
>>> pipeline.unload_ip_adapter()
>>> ...

Source code in mindone/diffusers/loaders/ip_adapter.py

def unload_ip_adapter(self):
    """
    Unloads the IP Adapter weights

    Clears the image encoder, clears the feature extractor (only for pipelines without a `safety_checker`
    attribute), resets the UNet's hidden-state projection and replaces every IP-Adapter attention processor
    with a fresh `AttnProcessor2_0`.

    Examples:

    ```python
    >>> # Assuming `pipeline` is already loaded with the IP Adapter weights.
    >>> pipeline.unload_ip_adapter()
    >>> ...
    ```
    """
    # Use `self.unet` when present and fall back to the attribute named by `self.unet_name` otherwise.
    unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet

    # remove CLIP image encoder
    if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is not None:
        self.image_encoder = None
        self.register_to_config(image_encoder=[None, None])

    # Remove the feature extractor only on pipelines that have no `safety_checker` attribute at all, because
    # the safety checker uses the feature extractor later.
    if not hasattr(self, "safety_checker"):
        if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is not None:
            self.feature_extractor = None
            self.register_to_config(feature_extractor=[None, None])

    # remove hidden encoder
    unet.encoder_hid_proj = None
    unet.config.encoder_hid_dim_type = None

    # Kolors: restore `encoder_hid_proj` with `text_encoder_hid_proj`
    if hasattr(unet, "text_encoder_hid_proj") and unet.text_encoder_hid_proj is not None:
        unet.encoder_hid_proj = unet.text_encoder_hid_proj
        unet.text_encoder_hid_proj = None
        unet.config.encoder_hid_dim_type = "text_proj"

    # Restore the original UNet attention processors: IP-Adapter processors become a default
    # `AttnProcessor2_0`, every other processor is re-created from its own class.
    ip_adapter_processor_types = (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)
    attn_procs = {}
    for name, processor in unet.attn_processors.items():
        if isinstance(processor, ip_adapter_processor_types):
            attn_procs[name] = AttnProcessor2_0()
        else:
            attn_procs[name] = processor.__class__()
    unet.set_attn_processor(attn_procs)

`mindone.diffusers.image_processor.IPAdapterMaskProcessor` ¶

Bases: VaeImageProcessor

Image processor for IP Adapter image masks.

PARAMETER	DESCRIPTION
`do_resize`	Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. TYPE: `bool`, optional, defaults to `True` DEFAULT: `True`
`vae_scale_factor`	VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor. TYPE: `int`, optional, defaults to `8` DEFAULT: `8`
`resample`	Resampling filter to use when resizing the image. TYPE: `str`, optional, defaults to `lanczos` DEFAULT: `'lanczos'`
`do_normalize`	Whether to normalize the image to [-1,1]. TYPE: `bool`, optional, defaults to `False` DEFAULT: `False`
`do_binarize`	Whether to binarize the image to 0/1. TYPE: `bool`, optional, defaults to `True` DEFAULT: `True`
`do_convert_grayscale`	Whether to convert the images to grayscale format. TYPE: `bool`, optional, defaults to `True` DEFAULT: `True`

Source code in mindone/diffusers/image_processor.py

class IPAdapterMaskProcessor(VaeImageProcessor):
    """
    Image processor for IP Adapter image masks.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`.
        vae_scale_factor (`int`, *optional*, defaults to `8`):
            VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor.
        resample (`str`, *optional*, defaults to `lanczos`):
            Resampling filter to use when resizing the image.
        do_normalize (`bool`, *optional*, defaults to `False`):
            Whether to normalize the image to [-1,1].
        do_binarize (`bool`, *optional*, defaults to `True`):
            Whether to binarize the image to 0/1.
        do_convert_grayscale (`bool`, *optional*, defaults to `True`):
            Whether to convert the images to grayscale format.

    """

    config_name = CONFIG_NAME

    @register_to_config
    def __init__(
        self,
        do_resize: bool = True,
        vae_scale_factor: int = 8,
        resample: str = "lanczos",
        do_normalize: bool = False,
        do_binarize: bool = True,
        do_convert_grayscale: bool = True,
    ):
        # Every option is forwarded unchanged to the parent constructor.
        super().__init__(
            do_resize=do_resize,
            vae_scale_factor=vae_scale_factor,
            resample=resample,
            do_normalize=do_normalize,
            do_binarize=do_binarize,
            do_convert_grayscale=do_convert_grayscale,
        )

    @staticmethod
    def downsample(mask: ms.Tensor, batch_size: int, num_queries: int, value_embed_dim: int):
        """
        Downsamples the provided mask tensor to match the expected dimensions for scaled dot-product attention.

        The target grid `(mask_h, mask_w)` is derived from `num_queries` and the width/height ratio of the mask
        itself. The mask is resized to that grid with bicubic interpolation and flattened. If the grid holds fewer
        than `num_queries` cells (the aspect ratio of the mask does not match the one implied by `num_queries`),
        the flattened mask is zero-padded up to `num_queries` entries. No warning is issued in that case.

        Args:
            mask (`ms.Tensor`):
                The input mask tensor generated with `IPAdapterMaskProcessor.preprocess()`. Dimensions 1 and 2 are
                read as height and width.
            batch_size (`int`):
                The batch size. The mask is tiled `batch_size` times along dimension 0 when its leading dimension
                is smaller than `batch_size`.
            num_queries (`int`):
                The number of queries.
            value_embed_dim (`int`):
                The dimensionality of the value embeddings.

        Returns:
            `ms.Tensor`:
                The downsampled mask tensor, with `num_queries` entries along dimension 1 and `value_embed_dim`
                entries along dimension 2.

        """
        o_h = mask.shape[1]
        o_w = mask.shape[2]
        ratio = o_w / o_h
        # Clamp to at least one row: for a very wide mask `num_queries / ratio` drops below 1, `int(...)` gives 0
        # and the modulo / floor division below would raise ZeroDivisionError.
        mask_h = max(1, int(math.sqrt(num_queries / ratio)))
        # Add one extra row when `num_queries` is not a multiple of `mask_h`.
        mask_h = mask_h + int((num_queries % mask_h) != 0)
        mask_w = num_queries // mask_h

        # `interpolate` is fed a tensor with one extra leading dimension, which is removed again afterwards.
        mask_downsample = mint.nn.functional.interpolate(
            mask.unsqueeze(0), size=(mask_h, mask_w), mode="bicubic"
        ).squeeze(0)

        # Repeat batch_size times
        if mask_downsample.shape[0] < batch_size:
            mask_downsample = mask_downsample.tile((batch_size, 1, 1))

        # Flatten the (mask_h, mask_w) grid into a single axis.
        mask_downsample = mask_downsample.view(mask_downsample.shape[0], -1)

        downsampled_area = mask_h * mask_w
        # If the output image and the mask do not have the same aspect ratio, tensor shapes will not match
        # Pad tensor if downsampled_mask.shape[1] is smaller than num_queries
        if downsampled_area < num_queries:
            mask_downsample = mint.nn.functional.pad(
                mask_downsample, (0, num_queries - mask_downsample.shape[1]), value=0.0
            )
        # Discard last embeddings if downsampled_mask.shape[1] is bigger than num_queries.
        # `mask_w` comes from a floor division, so `mask_h * mask_w` cannot exceed `num_queries`;
        # the branch is kept purely as a safeguard.
        if downsampled_area > num_queries:
            mask_downsample = mask_downsample[:, :num_queries]

        # Repeat last dimension to match SDPA output shape
        mask_downsample = mask_downsample.view(mask_downsample.shape[0], mask_downsample.shape[1], 1).tile(
            (1, 1, value_embed_dim)
        )

        return mask_downsample

`mindone.diffusers.image_processor.IPAdapterMaskProcessor.downsample(mask, batch_size, num_queries, value_embed_dim)` `staticmethod` ¶

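Downsamples the provided mask tensor to match the expected dimensions for scaled dot-product attention. If the aspect ratio of the mask does not match the aspect ratio of the output image, the flattened mask is zero-padded up to `num_queries` entries; the source listed below does not issue a warning in that case.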

PARAMETER	DESCRIPTION
`mask`	The input mask tensor generated with `IPAdapterMaskProcessor.preprocess()`. TYPE: `ms.Tensor`
`batch_size`	The batch size. TYPE: `int`
`num_queries`	The number of queries. TYPE: `int`
`value_embed_dim`	The dimensionality of the value embeddings. TYPE: `int`

RETURNS	DESCRIPTION
	`ms.Tensor`: The downsampled mask tensor.

Source code in mindone/diffusers/image_processor.py

@staticmethod
def downsample(mask: ms.Tensor, batch_size: int, num_queries: int, value_embed_dim: int):
    """
    Downsamples the provided mask tensor to match the expected dimensions for scaled dot-product attention.

    The target grid `(mask_h, mask_w)` is derived from `num_queries` and the width/height ratio of the mask
    itself. The mask is resized to that grid with bicubic interpolation and flattened. If the grid holds fewer
    than `num_queries` cells (the aspect ratio of the mask does not match the one implied by `num_queries`),
    the flattened mask is zero-padded up to `num_queries` entries. No warning is issued in that case.

    Args:
        mask (`ms.Tensor`):
            The input mask tensor generated with `IPAdapterMaskProcessor.preprocess()`. Dimensions 1 and 2 are
            read as height and width.
        batch_size (`int`):
            The batch size. The mask is tiled `batch_size` times along dimension 0 when its leading dimension
            is smaller than `batch_size`.
        num_queries (`int`):
            The number of queries.
        value_embed_dim (`int`):
            The dimensionality of the value embeddings.

    Returns:
        `ms.Tensor`:
            The downsampled mask tensor, with `num_queries` entries along dimension 1 and `value_embed_dim`
            entries along dimension 2.

    """
    o_h = mask.shape[1]
    o_w = mask.shape[2]
    ratio = o_w / o_h
    # Clamp to at least one row: for a very wide mask `num_queries / ratio` drops below 1, `int(...)` gives 0
    # and the modulo / floor division below would raise ZeroDivisionError.
    mask_h = max(1, int(math.sqrt(num_queries / ratio)))
    # Add one extra row when `num_queries` is not a multiple of `mask_h`.
    mask_h = mask_h + int((num_queries % mask_h) != 0)
    mask_w = num_queries // mask_h

    # `interpolate` is fed a tensor with one extra leading dimension, which is removed again afterwards.
    mask_downsample = mint.nn.functional.interpolate(
        mask.unsqueeze(0), size=(mask_h, mask_w), mode="bicubic"
    ).squeeze(0)

    # Repeat batch_size times
    if mask_downsample.shape[0] < batch_size:
        mask_downsample = mask_downsample.tile((batch_size, 1, 1))

    # Flatten the (mask_h, mask_w) grid into a single axis.
    mask_downsample = mask_downsample.view(mask_downsample.shape[0], -1)

    downsampled_area = mask_h * mask_w
    # If the output image and the mask do not have the same aspect ratio, tensor shapes will not match
    # Pad tensor if downsampled_mask.shape[1] is smaller than num_queries
    if downsampled_area < num_queries:
        mask_downsample = mint.nn.functional.pad(
            mask_downsample, (0, num_queries - mask_downsample.shape[1]), value=0.0
        )
    # Discard last embeddings if downsampled_mask.shape[1] is bigger than num_queries.
    # `mask_w` comes from a floor division, so `mask_h * mask_w` cannot exceed `num_queries`;
    # the branch is kept purely as a safeguard.
    if downsampled_area > num_queries:
        mask_downsample = mask_downsample[:, :num_queries]

    # Repeat last dimension to match SDPA output shape
    mask_downsample = mask_downsample.view(mask_downsample.shape[0], mask_downsample.shape[1], 1).tile(
        (1, 1, value_embed_dim)
    )

    return mask_downsample

IP-Adapter¶

mindone.diffusers.loaders.ip_adapter.IPAdapterMixin ¶

mindone.diffusers.loaders.ip_adapter.IPAdapterMixin.load_ip_adapter(pretrained_model_name_or_path_or_dict, subfolder, weight_name, image_encoder_folder='image_encoder', **kwargs) ¶

mindone.diffusers.loaders.ip_adapter.IPAdapterMixin.set_ip_adapter_scale(scale) ¶

mindone.diffusers.loaders.ip_adapter.IPAdapterMixin.unload_ip_adapter() ¶

mindone.diffusers.image_processor.IPAdapterMaskProcessor ¶

mindone.diffusers.image_processor.IPAdapterMaskProcessor.downsample(mask, batch_size, num_queries, value_embed_dim) staticmethod ¶

`mindone.diffusers.loaders.ip_adapter.IPAdapterMixin` ¶

`mindone.diffusers.loaders.ip_adapter.IPAdapterMixin.load_ip_adapter(pretrained_model_name_or_path_or_dict, subfolder, weight_name, image_encoder_folder='image_encoder', **kwargs)` ¶

`mindone.diffusers.loaders.ip_adapter.IPAdapterMixin.set_ip_adapter_scale(scale)` ¶

`mindone.diffusers.loaders.ip_adapter.IPAdapterMixin.unload_ip_adapter()` ¶

`mindone.diffusers.image_processor.IPAdapterMaskProcessor` ¶

`mindone.diffusers.image_processor.IPAdapterMaskProcessor.downsample(mask, batch_size, num_queries, value_embed_dim)` `staticmethod` ¶