IP-Adapter

IP-Adapter is a lightweight adapter that enables prompting a diffusion model with an image. It works by decoupling the cross-attention layers for image and text features: the image features, produced by an image encoder, are attended to through their own cross-attention layers instead of sharing the text cross-attention layers.

Tip

Learn how to load an IP-Adapter checkpoint and image in the IP-Adapter loading guide, and see how to use it in the IP-Adapter usage guide.
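
For orientation, here is a minimal sketch of the typical workflow: load a pipeline, attach an IP-Adapter checkpoint with `load_ip_adapter`, set its scale, and pass a reference image at inference time. The model id, checkpoint names, and image path below are illustrative; substitute the ones you actually use.

```python
from PIL import Image
from mindone.diffusers import StableDiffusionPipeline

# Illustrative model id and IP-Adapter checkpoint; replace with your own.
pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipeline.load_ip_adapter(
    "h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.safetensors"
)
pipeline.set_ip_adapter_scale(0.6)

ip_image = Image.open("reference.png")  # hypothetical reference image
images = pipeline(
    prompt="a photo of a cat wearing a beret",
    ip_adapter_image=ip_image,
)[0]
```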

mindone.diffusers.loaders.ip_adapter.IPAdapterMixin

Mixin for handling IP Adapters.

Source code in mindone/diffusers/loaders/ip_adapter.py
class IPAdapterMixin:
    """Mixin for handling IP Adapters."""

    @validate_hf_hub_args
    def load_ip_adapter(
        self,
        pretrained_model_name_or_path_or_dict: Union[str, List[str], Dict[str, mindspore.Tensor]],
        subfolder: Union[str, List[str]],
        weight_name: Union[str, List[str]],
        image_encoder_folder: Optional[str] = "image_encoder",
        **kwargs,
    ):
        """
        Parameters:
            pretrained_model_name_or_path_or_dict (`str` or `List[str]` or `os.PathLike` or `List[os.PathLike]` or `dict` or `List[dict]`):
                Can be either:

                    - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
                      the Hub.
                    - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
                      with [`ModelMixin.save_pretrained`].
                    - A MindSpore state dict.
            subfolder (`str` or `List[str]`):
                The subfolder location of a model file within a larger model repository on the Hub or locally. If a
                list is passed, it should have the same length as `weight_name`.
            weight_name (`str` or `List[str]`):
                The name of the weight file to load. If a list is passed, it should have the same length as
                `subfolder`.
            image_encoder_folder (`str`, *optional*, defaults to `image_encoder`):
                The subfolder location of the image encoder within a larger model repository on the Hub or locally.
                Pass `None` to not load the image encoder. If the image encoder is located in a folder inside
                `subfolder`, you only need to pass the name of the folder that contains image encoder weights, e.g.
                `image_encoder_folder="image_encoder"`. If the image encoder is located in a folder other than
                `subfolder`, you should pass the path to the folder that contains image encoder weights, for example,
                `image_encoder_folder="different_subfolder/image_encoder"`.
            cache_dir (`Union[str, os.PathLike]`, *optional*):
                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
                is not used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                cached versions if they exist.

            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            local_files_only (`bool`, *optional*, defaults to `False`):
                Whether to only load local model weights and configuration files or not. If set to `True`, the model
                won't be downloaded from the Hub.
            token (`str` or *bool*, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
                `diffusers-cli login` (stored in `~/.huggingface`) is used.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
                allowed by Git.
        """

        # handle the list inputs for multiple IP Adapters
        if not isinstance(weight_name, list):
            weight_name = [weight_name]

        if not isinstance(pretrained_model_name_or_path_or_dict, list):
            pretrained_model_name_or_path_or_dict = [pretrained_model_name_or_path_or_dict]
        if len(pretrained_model_name_or_path_or_dict) == 1:
            pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict * len(weight_name)

        if not isinstance(subfolder, list):
            subfolder = [subfolder]
        if len(subfolder) == 1:
            subfolder = subfolder * len(weight_name)

        if len(weight_name) != len(pretrained_model_name_or_path_or_dict):
            raise ValueError("`weight_name` and `pretrained_model_name_or_path_or_dict` must have the same length.")

        if len(weight_name) != len(subfolder):
            raise ValueError("`weight_name` and `subfolder` must have the same length.")

        # Load the main state dict first.
        cache_dir = kwargs.pop("cache_dir", None)
        force_download = kwargs.pop("force_download", False)
        proxies = kwargs.pop("proxies", None)
        local_files_only = kwargs.pop("local_files_only", None)
        token = kwargs.pop("token", None)
        revision = kwargs.pop("revision", None)

        state_dicts = []
        for pretrained_model_name_or_path_or_dict, weight_name, subfolder in zip(
            pretrained_model_name_or_path_or_dict, weight_name, subfolder
        ):
            if not isinstance(pretrained_model_name_or_path_or_dict, dict):
                model_file = _get_model_file(
                    pretrained_model_name_or_path_or_dict,
                    weights_name=weight_name,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    local_files_only=local_files_only,
                    token=token,
                    revision=revision,
                    subfolder=subfolder,
                )
                if weight_name.endswith(".safetensors"):
                    state_dict = {"image_proj": {}, "ip_adapter": {}}
                    for key, value in load_file(model_file).items():
                        if key.startswith("image_proj."):
                            state_dict["image_proj"][key.replace("image_proj.", "")] = value
                        elif key.startswith("ip_adapter."):
                            state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = value
                else:
                    raise NotImplementedError(
                        f"Only supports deserialization of weights file in safetensors format, but got {model_file}"
                    )
            else:
                state_dict = pretrained_model_name_or_path_or_dict

            keys = list(state_dict.keys())
            if keys != ["image_proj", "ip_adapter"]:
                raise ValueError("Required keys are (`image_proj` and `ip_adapter`) missing from the state dict.")

            state_dicts.append(state_dict)

            # load CLIP image encoder here if it has not been registered to the pipeline yet
            if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is None:
                if image_encoder_folder is not None:
                    if not isinstance(pretrained_model_name_or_path_or_dict, dict):
                        logger.info(f"loading image_encoder from {pretrained_model_name_or_path_or_dict}")
                        if image_encoder_folder.count("/") == 0:
                            image_encoder_subfolder = Path(subfolder, image_encoder_folder).as_posix()
                        else:
                            image_encoder_subfolder = Path(image_encoder_folder).as_posix()

                        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
                            pretrained_model_name_or_path_or_dict,
                            subfolder=image_encoder_subfolder,
                            cache_dir=cache_dir,
                            local_files_only=local_files_only,
                        ).to(self.dtype)
                        self.register_modules(image_encoder=image_encoder)
                    else:
                        raise ValueError(
                            "`image_encoder` cannot be loaded because `pretrained_model_name_or_path_or_dict` is a state dict."
                        )
                else:
                    logger.warning(
                        "image_encoder is not loaded since `image_encoder_folder=None` passed."
                        "You will not be able to use `ip_adapter_image` when calling the pipeline with IP-Adapter."
                        "Use `ip_adapter_image_embeds` to pass pre-generated image embedding instead."
                    )

            # create feature extractor if it has not been registered to the pipeline yet
            if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None:
                clip_image_size = self.image_encoder.config.image_size
                feature_extractor = CLIPImageProcessor(size=clip_image_size, crop_size=clip_image_size)
                self.register_modules(feature_extractor=feature_extractor)

        # load ip-adapter into unet
        unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
        unet._load_ip_adapter_weights(state_dicts)

        extra_loras = unet._load_ip_adapter_loras(state_dicts)
        if extra_loras != {}:
            # apply the IP Adapter Face ID LoRA weights
            peft_config = getattr(unet, "peft_config", {})
            for k, lora in extra_loras.items():
                if f"faceid_{k}" not in peft_config:
                    self.load_lora_weights(lora, adapter_name=f"faceid_{k}")
                    self.set_adapters([f"faceid_{k}"], adapter_weights=[1.0])

    def set_ip_adapter_scale(self, scale):
        """
        Set IP-Adapter scales per-transformer block. Input `scale` could be a single config or a list of configs for
        granular control over each IP-Adapter behavior. A config can be a float or a dictionary.

        Example:

        ```py
        # To use original IP-Adapter
        scale = 1.0
        pipeline.set_ip_adapter_scale(scale)

        # To use style block only
        scale = {
            "up": {"block_0": [0.0, 1.0, 0.0]},
        }
        pipeline.set_ip_adapter_scale(scale)

        # To use style+layout blocks
        scale = {
            "down": {"block_2": [0.0, 1.0]},
            "up": {"block_0": [0.0, 1.0, 0.0]},
        }
        pipeline.set_ip_adapter_scale(scale)

        # To use style and layout from 2 reference images
        scales = [{"down": {"block_2": [0.0, 1.0]}}, {"up": {"block_0": [0.0, 1.0, 0.0]}}]
        pipeline.set_ip_adapter_scale(scales)
        ```
        """
        unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
        if not isinstance(scale, list):
            scale = [scale]
        scale_configs = _maybe_expand_lora_scales(unet, scale, default_scale=0.0)

        for attn_name, attn_processor in unet.attn_processors.items():
            if isinstance(attn_processor, (IPAdapterAttnProcessor)):
                if len(scale_configs) != len(attn_processor.scale):
                    raise ValueError(
                        f"Cannot assign {len(scale_configs)} scale_configs to "
                        f"{len(attn_processor.scale)} IP-Adapter."
                    )
                elif len(scale_configs) == 1:
                    scale_configs = scale_configs * len(attn_processor.scale)
                for i, scale_config in enumerate(scale_configs):
                    if isinstance(scale_config, dict):
                        for k, s in scale_config.items():
                            if attn_name.startswith(k):
                                attn_processor.scale[i] = s
                    else:
                        attn_processor.scale[i] = scale_config

    def unload_ip_adapter(self):
        """
        Unloads the IP Adapter weights

        Examples:

        ```python
        >>> # Assuming `pipeline` is already loaded with the IP Adapter weights.
        >>> pipeline.unload_ip_adapter()
        >>> ...
        ```
        """
        # remove CLIP image encoder
        if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is not None:
            self.image_encoder = None
            self.register_to_config(image_encoder=[None, None])

        # remove feature extractor only when safety_checker is None as safety_checker uses
        # the feature_extractor later
        if not hasattr(self, "safety_checker"):
            if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is not None:
                self.feature_extractor = None
                self.register_to_config(feature_extractor=[None, None])

        # remove hidden encoder
        self.unet.encoder_hid_proj = None
        self.unet.config.encoder_hid_dim_type = None

        # Kolors: restore `encoder_hid_proj` with `text_encoder_hid_proj`
        if hasattr(self.unet, "text_encoder_hid_proj") and self.unet.text_encoder_hid_proj is not None:
            self.unet.encoder_hid_proj = self.unet.text_encoder_hid_proj
            self.unet.text_encoder_hid_proj = None
            self.unet.config.encoder_hid_dim_type = "text_proj"

        # restore original Unet attention processors layers
        attn_procs = {}
        for name, value in self.unet.attn_processors.items():
            attn_processor_class = AttnProcessor()
            attn_procs[name] = attn_processor_class if isinstance(value, IPAdapterAttnProcessor) else value.__class__()
        self.unet.set_attn_processor(attn_procs)

mindone.diffusers.loaders.ip_adapter.IPAdapterMixin.load_ip_adapter(pretrained_model_name_or_path_or_dict, subfolder, weight_name, image_encoder_folder='image_encoder', **kwargs)

PARAMETER DESCRIPTION
pretrained_model_name_or_path_or_dict

Can be either:

- A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
  the Hub.
- A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
  with [`ModelMixin.save_pretrained`].
- A MindSpore state dict.

TYPE: `str` or `List[str]` or `os.PathLike` or `List[os.PathLike]` or `dict` or `List[dict]`

subfolder

The subfolder location of a model file within a larger model repository on the Hub or locally. If a list is passed, it should have the same length as weight_name.

TYPE: `str` or `List[str]`

weight_name

The name of the weight file to load. If a list is passed, it should have the same length as subfolder.

TYPE: `str` or `List[str]`

image_encoder_folder

The subfolder location of the image encoder within a larger model repository on the Hub or locally. Pass None to not load the image encoder. If the image encoder is located in a folder inside subfolder, you only need to pass the name of the folder that contains image encoder weights, e.g. image_encoder_folder="image_encoder". If the image encoder is located in a folder other than subfolder, you should pass the path to the folder that contains image encoder weights, for example, image_encoder_folder="different_subfolder/image_encoder".

TYPE: `str`, *optional*, defaults to `image_encoder` DEFAULT: 'image_encoder'

cache_dir

Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used.

TYPE: `Union[str, os.PathLike]`, *optional*

force_download

Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist.

TYPE: `bool`, *optional*, defaults to `False`

proxies

A dictionary of proxy servers to use by protocol or endpoint, for example, {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request.

TYPE: `Dict[str, str]`, *optional*

local_files_only

Whether to only load local model weights and configuration files or not. If set to True, the model won't be downloaded from the Hub.

TYPE: `bool`, *optional*, defaults to `False`

token

The token to use as HTTP bearer authorization for remote files. If True, the token generated from diffusers-cli login (stored in ~/.huggingface) is used.

TYPE: `str` or *bool*, *optional*

revision

The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier allowed by Git.

TYPE: `str`, *optional*, defaults to `"main"`
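
As the list-typed parameters above suggest, several IP-Adapters can be loaded in one call by passing lists of equal length, and each adapter can then be weighted individually. A minimal sketch, with illustrative checkpoint names for an SDXL pipeline:

```python
# Sketch: load two IP-Adapter checkpoints in a single call (names are illustrative).
pipeline.load_ip_adapter(
    "h94/IP-Adapter",
    subfolder="sdxl_models",
    weight_name=[
        "ip-adapter-plus_sdxl_vit-h.safetensors",
        "ip-adapter-plus-face_sdxl_vit-h.safetensors",
    ],
)
# One scale per loaded adapter.
pipeline.set_ip_adapter_scale([0.7, 0.3])
```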

Source code in mindone/diffusers/loaders/ip_adapter.py
@validate_hf_hub_args
def load_ip_adapter(
    self,
    pretrained_model_name_or_path_or_dict: Union[str, List[str], Dict[str, mindspore.Tensor]],
    subfolder: Union[str, List[str]],
    weight_name: Union[str, List[str]],
    image_encoder_folder: Optional[str] = "image_encoder",
    **kwargs,
):
    """
    Parameters:
        pretrained_model_name_or_path_or_dict (`str` or `List[str]` or `os.PathLike` or `List[os.PathLike]` or `dict` or `List[dict]`):
            Can be either:

                - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
                  the Hub.
                - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
                  with [`ModelMixin.save_pretrained`].
                - A MindSpore state dict.
        subfolder (`str` or `List[str]`):
            The subfolder location of a model file within a larger model repository on the Hub or locally. If a
            list is passed, it should have the same length as `weight_name`.
        weight_name (`str` or `List[str]`):
            The name of the weight file to load. If a list is passed, it should have the same length as
            `subfolder`.
        image_encoder_folder (`str`, *optional*, defaults to `image_encoder`):
            The subfolder location of the image encoder within a larger model repository on the Hub or locally.
            Pass `None` to not load the image encoder. If the image encoder is located in a folder inside
            `subfolder`, you only need to pass the name of the folder that contains image encoder weights, e.g.
            `image_encoder_folder="image_encoder"`. If the image encoder is located in a folder other than
            `subfolder`, you should pass the path to the folder that contains image encoder weights, for example,
            `image_encoder_folder="different_subfolder/image_encoder"`.
        cache_dir (`Union[str, os.PathLike]`, *optional*):
            Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
            is not used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force the (re-)download of the model weights and configuration files, overriding the
            cached versions if they exist.

        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
        local_files_only (`bool`, *optional*, defaults to `False`):
            Whether to only load local model weights and configuration files or not. If set to `True`, the model
            won't be downloaded from the Hub.
        token (`str` or *bool*, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
            `diffusers-cli login` (stored in `~/.huggingface`) is used.
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
            allowed by Git.
    """

    # handle the list inputs for multiple IP Adapters
    if not isinstance(weight_name, list):
        weight_name = [weight_name]

    if not isinstance(pretrained_model_name_or_path_or_dict, list):
        pretrained_model_name_or_path_or_dict = [pretrained_model_name_or_path_or_dict]
    if len(pretrained_model_name_or_path_or_dict) == 1:
        pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict * len(weight_name)

    if not isinstance(subfolder, list):
        subfolder = [subfolder]
    if len(subfolder) == 1:
        subfolder = subfolder * len(weight_name)

    if len(weight_name) != len(pretrained_model_name_or_path_or_dict):
        raise ValueError("`weight_name` and `pretrained_model_name_or_path_or_dict` must have the same length.")

    if len(weight_name) != len(subfolder):
        raise ValueError("`weight_name` and `subfolder` must have the same length.")

    # Load the main state dict first.
    cache_dir = kwargs.pop("cache_dir", None)
    force_download = kwargs.pop("force_download", False)
    proxies = kwargs.pop("proxies", None)
    local_files_only = kwargs.pop("local_files_only", None)
    token = kwargs.pop("token", None)
    revision = kwargs.pop("revision", None)

    state_dicts = []
    for pretrained_model_name_or_path_or_dict, weight_name, subfolder in zip(
        pretrained_model_name_or_path_or_dict, weight_name, subfolder
    ):
        if not isinstance(pretrained_model_name_or_path_or_dict, dict):
            model_file = _get_model_file(
                pretrained_model_name_or_path_or_dict,
                weights_name=weight_name,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                local_files_only=local_files_only,
                token=token,
                revision=revision,
                subfolder=subfolder,
            )
            if weight_name.endswith(".safetensors"):
                state_dict = {"image_proj": {}, "ip_adapter": {}}
                for key, value in load_file(model_file).items():
                    if key.startswith("image_proj."):
                        state_dict["image_proj"][key.replace("image_proj.", "")] = value
                    elif key.startswith("ip_adapter."):
                        state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = value
            else:
                raise NotImplementedError(
                    f"Only supports deserialization of weights file in safetensors format, but got {model_file}"
                )
        else:
            state_dict = pretrained_model_name_or_path_or_dict

        keys = list(state_dict.keys())
        if keys != ["image_proj", "ip_adapter"]:
            raise ValueError("Required keys are (`image_proj` and `ip_adapter`) missing from the state dict.")

        state_dicts.append(state_dict)

        # load CLIP image encoder here if it has not been registered to the pipeline yet
        if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is None:
            if image_encoder_folder is not None:
                if not isinstance(pretrained_model_name_or_path_or_dict, dict):
                    logger.info(f"loading image_encoder from {pretrained_model_name_or_path_or_dict}")
                    if image_encoder_folder.count("/") == 0:
                        image_encoder_subfolder = Path(subfolder, image_encoder_folder).as_posix()
                    else:
                        image_encoder_subfolder = Path(image_encoder_folder).as_posix()

                    image_encoder = CLIPVisionModelWithProjection.from_pretrained(
                        pretrained_model_name_or_path_or_dict,
                        subfolder=image_encoder_subfolder,
                        cache_dir=cache_dir,
                        local_files_only=local_files_only,
                    ).to(self.dtype)
                    self.register_modules(image_encoder=image_encoder)
                else:
                    raise ValueError(
                        "`image_encoder` cannot be loaded because `pretrained_model_name_or_path_or_dict` is a state dict."
                    )
            else:
                logger.warning(
                    "image_encoder is not loaded since `image_encoder_folder=None` passed."
                    "You will not be able to use `ip_adapter_image` when calling the pipeline with IP-Adapter."
                    "Use `ip_adapter_image_embeds` to pass pre-generated image embedding instead."
                )

        # create feature extractor if it has not been registered to the pipeline yet
        if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None:
            clip_image_size = self.image_encoder.config.image_size
            feature_extractor = CLIPImageProcessor(size=clip_image_size, crop_size=clip_image_size)
            self.register_modules(feature_extractor=feature_extractor)

    # load ip-adapter into unet
    unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
    unet._load_ip_adapter_weights(state_dicts)

    extra_loras = unet._load_ip_adapter_loras(state_dicts)
    if extra_loras != {}:
        # apply the IP Adapter Face ID LoRA weights
        peft_config = getattr(unet, "peft_config", {})
        for k, lora in extra_loras.items():
            if f"faceid_{k}" not in peft_config:
                self.load_lora_weights(lora, adapter_name=f"faceid_{k}")
                self.set_adapters([f"faceid_{k}"], adapter_weights=[1.0])

mindone.diffusers.loaders.ip_adapter.IPAdapterMixin.set_ip_adapter_scale(scale)

Set IP-Adapter scales per-transformer block. Input scale could be a single config or a list of configs for granular control over each IP-Adapter behavior. A config can be a float or a dictionary.

Example:

# To use original IP-Adapter
scale = 1.0
pipeline.set_ip_adapter_scale(scale)

# To use style block only
scale = {
    "up": {"block_0": [0.0, 1.0, 0.0]},
}
pipeline.set_ip_adapter_scale(scale)

# To use style+layout blocks
scale = {
    "down": {"block_2": [0.0, 1.0]},
    "up": {"block_0": [0.0, 1.0, 0.0]},
}
pipeline.set_ip_adapter_scale(scale)

# To use style and layout from 2 reference images
scales = [{"down": {"block_2": [0.0, 1.0]}}, {"up": {"block_0": [0.0, 1.0, 0.0]}}]
pipeline.set_ip_adapter_scale(scales)
Source code in mindone/diffusers/loaders/ip_adapter.py
def set_ip_adapter_scale(self, scale):
    """
    Set IP-Adapter scales per-transformer block. Input `scale` could be a single config or a list of configs for
    granular control over each IP-Adapter behavior. A config can be a float or a dictionary.

    Example:

    ```py
    # To use original IP-Adapter
    scale = 1.0
    pipeline.set_ip_adapter_scale(scale)

    # To use style block only
    scale = {
        "up": {"block_0": [0.0, 1.0, 0.0]},
    }
    pipeline.set_ip_adapter_scale(scale)

    # To use style+layout blocks
    scale = {
        "down": {"block_2": [0.0, 1.0]},
        "up": {"block_0": [0.0, 1.0, 0.0]},
    }
    pipeline.set_ip_adapter_scale(scale)

    # To use style and layout from 2 reference images
    scales = [{"down": {"block_2": [0.0, 1.0]}}, {"up": {"block_0": [0.0, 1.0, 0.0]}}]
    pipeline.set_ip_adapter_scale(scales)
    ```
    """
    unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
    if not isinstance(scale, list):
        scale = [scale]
    scale_configs = _maybe_expand_lora_scales(unet, scale, default_scale=0.0)

    for attn_name, attn_processor in unet.attn_processors.items():
        if isinstance(attn_processor, (IPAdapterAttnProcessor)):
            if len(scale_configs) != len(attn_processor.scale):
                raise ValueError(
                    f"Cannot assign {len(scale_configs)} scale_configs to "
                    f"{len(attn_processor.scale)} IP-Adapter."
                )
            elif len(scale_configs) == 1:
                scale_configs = scale_configs * len(attn_processor.scale)
            for i, scale_config in enumerate(scale_configs):
                if isinstance(scale_config, dict):
                    for k, s in scale_config.items():
                        if attn_name.startswith(k):
                            attn_processor.scale[i] = s
                else:
                    attn_processor.scale[i] = scale_config

mindone.diffusers.loaders.ip_adapter.IPAdapterMixin.unload_ip_adapter()

Unloads the IP-Adapter weights.

Examples:

>>> # Assuming `pipeline` is already loaded with the IP Adapter weights.
>>> pipeline.unload_ip_adapter()
>>> ...
Source code in mindone/diffusers/loaders/ip_adapter.py
def unload_ip_adapter(self):
    """
    Unloads the IP Adapter weights

    Examples:

    ```python
    >>> # Assuming `pipeline` is already loaded with the IP Adapter weights.
    >>> pipeline.unload_ip_adapter()
    >>> ...
    ```
    """
    # remove CLIP image encoder
    if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is not None:
        self.image_encoder = None
        self.register_to_config(image_encoder=[None, None])

    # remove feature extractor only when safety_checker is None as safety_checker uses
    # the feature_extractor later
    if not hasattr(self, "safety_checker"):
        if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is not None:
            self.feature_extractor = None
            self.register_to_config(feature_extractor=[None, None])

    # remove hidden encoder
    self.unet.encoder_hid_proj = None
    self.unet.config.encoder_hid_dim_type = None

    # Kolors: restore `encoder_hid_proj` with `text_encoder_hid_proj`
    if hasattr(self.unet, "text_encoder_hid_proj") and self.unet.text_encoder_hid_proj is not None:
        self.unet.encoder_hid_proj = self.unet.text_encoder_hid_proj
        self.unet.text_encoder_hid_proj = None
        self.unet.config.encoder_hid_dim_type = "text_proj"

    # restore original Unet attention processors layers
    attn_procs = {}
    for name, value in self.unet.attn_processors.items():
        attn_processor_class = AttnProcessor()
        attn_procs[name] = attn_processor_class if isinstance(value, IPAdapterAttnProcessor) else value.__class__()
    self.unet.set_attn_processor(attn_procs)

mindone.diffusers.image_processor.IPAdapterMaskProcessor

Bases: VaeImageProcessor

Image processor for IP Adapter image masks.

PARAMETER DESCRIPTION
do_resize

Whether to downscale the image's (height, width) dimensions to multiples of vae_scale_factor.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

vae_scale_factor

VAE scale factor. If do_resize is True, the image is automatically resized to multiples of this factor.

TYPE: `int`, *optional*, defaults to `8` DEFAULT: 8

resample

Resampling filter to use when resizing the image.

TYPE: `str`, *optional*, defaults to `lanczos` DEFAULT: 'lanczos'

do_normalize

Whether to normalize the image to [-1,1].

TYPE: `bool`, *optional*, defaults to `False` DEFAULT: False

do_binarize

Whether to binarize the image to 0/1.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True

do_convert_grayscale

Whether to convert the images to grayscale format.

TYPE: `bool`, *optional*, defaults to `True` DEFAULT: True
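
A minimal sketch of the intended workflow, assuming binary mask images loaded with PIL (file names and sizes are illustrative): preprocess the masks, then forward them to the pipeline so each IP-Adapter image only influences its masked region.

```python
from PIL import Image
from mindone.diffusers.image_processor import IPAdapterMaskProcessor

# Hypothetical mask files: white where the corresponding IP-Adapter image should apply.
mask1 = Image.open("mask_left.png")
mask2 = Image.open("mask_right.png")

processor = IPAdapterMaskProcessor()  # defaults: grayscale + binarize, no normalization
masks = processor.preprocess([mask1, mask2], height=1024, width=1024)

# The preprocessed masks are then typically passed to the pipeline call, e.g.
# pipeline(..., ip_adapter_image=[image1, image2],
#          cross_attention_kwargs={"ip_adapter_masks": masks})
```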

Source code in mindone/diffusers/image_processor.py
class IPAdapterMaskProcessor(VaeImageProcessor):
    """
    Image processor for IP Adapter image masks.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`.
        vae_scale_factor (`int`, *optional*, defaults to `8`):
            VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor.
        resample (`str`, *optional*, defaults to `lanczos`):
            Resampling filter to use when resizing the image.
        do_normalize (`bool`, *optional*, defaults to `False`):
            Whether to normalize the image to [-1,1].
        do_binarize (`bool`, *optional*, defaults to `True`):
            Whether to binarize the image to 0/1.
        do_convert_grayscale (`bool`, *optional*, defaults to `True`):
            Whether to convert the images to grayscale format.

    """

    config_name = CONFIG_NAME

    @register_to_config
    def __init__(
        self,
        do_resize: bool = True,
        vae_scale_factor: int = 8,
        resample: str = "lanczos",
        do_normalize: bool = False,
        do_binarize: bool = True,
        do_convert_grayscale: bool = True,
    ):
        super().__init__(
            do_resize=do_resize,
            vae_scale_factor=vae_scale_factor,
            resample=resample,
            do_normalize=do_normalize,
            do_binarize=do_binarize,
            do_convert_grayscale=do_convert_grayscale,
        )

    @staticmethod
    def downsample(mask: ms.Tensor, batch_size: int, num_queries: int, value_embed_dim: int):
        """
        Downsamples the provided mask tensor to match the expected dimensions for scaled dot-product attention. If the
        aspect ratio of the mask does not match the aspect ratio of the output image, a warning is issued.

        Args:
            mask (`ms.Tensor`):
                The input mask tensor generated with `IPAdapterMaskProcessor.preprocess()`.
            batch_size (`int`):
                The batch size.
            num_queries (`int`):
                The number of queries.
            value_embed_dim (`int`):
                The dimensionality of the value embeddings.

        Returns:
            `ms.Tensor`:
                The downsampled mask tensor.

        """
        o_h = mask.shape[1]
        o_w = mask.shape[2]
        ratio = o_w / o_h
        mask_h = int(math.sqrt(num_queries / ratio))
        mask_h = int(mask_h) + int((num_queries % int(mask_h)) != 0)
        mask_w = num_queries // mask_h

        mask_downsample = ops.interpolate(mask.unsqueeze(0), size=(mask_h, mask_w), mode="bicubic").squeeze(0)

        # Repeat batch_size times
        if mask_downsample.shape[0] < batch_size:
            mask_downsample = mask_downsample.tile((batch_size, 1, 1))

        mask_downsample = mask_downsample.view(mask_downsample.shape[0], -1)

        downsampled_area = mask_h * mask_w
        # If the output image and the mask do not have the same aspect ratio, tensor shapes will not match
        # Pad tensor if downsampled_mask.shape[1] is smaller than num_queries
        if downsampled_area < num_queries:
            mask_downsample = ops.Pad(paddings=((0, 0), (0, num_queries - mask_downsample.shape[1])))(mask_downsample)
        # Discard last embeddings if downsampled_mask.shape[1] is bigger than num_queries
        if downsampled_area > num_queries:
            mask_downsample = mask_downsample[:, :num_queries]

        # Repeat last dimension to match SDPA output shape
        mask_downsample = mask_downsample.view(mask_downsample.shape[0], mask_downsample.shape[1], 1).tile(
            (1, 1, value_embed_dim)
        )

        return mask_downsample

mindone.diffusers.image_processor.IPAdapterMaskProcessor.downsample(mask, batch_size, num_queries, value_embed_dim) staticmethod

Downsamples the provided mask tensor to match the expected dimensions for scaled dot-product attention. If the aspect ratio of the mask does not match the aspect ratio of the output image, a warning is issued.

PARAMETER DESCRIPTION
mask

The input mask tensor generated with IPAdapterMaskProcessor.preprocess().

TYPE: `ms.Tensor`

batch_size

The batch size.

TYPE: `int`

num_queries

The number of queries.

TYPE: `int`

value_embed_dim

The dimensionality of the value embeddings.

TYPE: `int`

RETURNS DESCRIPTION

ms.Tensor: The downsampled mask tensor.
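
To make the shape handling concrete, the sketch below uses illustrative sizes: a 1024x1024 mask and an attention layer with 4096 query tokens (a 64x64 latent grid) and a value embedding size of 1280. The mask is resized to the 64x64 grid, flattened, repeated across the batch, and broadcast over the value embedding dimension.

```python
import mindspore as ms
from mindspore import ops
from mindone.diffusers.image_processor import IPAdapterMaskProcessor

# Stand-in for a single preprocessed mask, shaped (1, height, width) here.
mask = ops.ones((1, 1024, 1024), ms.float32)

mask_down = IPAdapterMaskProcessor.downsample(
    mask, batch_size=2, num_queries=4096, value_embed_dim=1280
)
print(mask_down.shape)  # (2, 4096, 1280)
```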

Source code in mindone/diffusers/image_processor.py
@staticmethod
def downsample(mask: ms.Tensor, batch_size: int, num_queries: int, value_embed_dim: int):
    """
    Downsamples the provided mask tensor to match the expected dimensions for scaled dot-product attention. If the
    aspect ratio of the mask does not match the aspect ratio of the output image, a warning is issued.

    Args:
        mask (`ms.Tensor`):
            The input mask tensor generated with `IPAdapterMaskProcessor.preprocess()`.
        batch_size (`int`):
            The batch size.
        num_queries (`int`):
            The number of queries.
        value_embed_dim (`int`):
            The dimensionality of the value embeddings.

    Returns:
        `ms.Tensor`:
            The downsampled mask tensor.

    """
    o_h = mask.shape[1]
    o_w = mask.shape[2]
    ratio = o_w / o_h
    mask_h = int(math.sqrt(num_queries / ratio))
    mask_h = int(mask_h) + int((num_queries % int(mask_h)) != 0)
    mask_w = num_queries // mask_h

    mask_downsample = ops.interpolate(mask.unsqueeze(0), size=(mask_h, mask_w), mode="bicubic").squeeze(0)

    # Repeat batch_size times
    if mask_downsample.shape[0] < batch_size:
        mask_downsample = mask_downsample.tile((batch_size, 1, 1))

    mask_downsample = mask_downsample.view(mask_downsample.shape[0], -1)

    downsampled_area = mask_h * mask_w
    # If the output image and the mask do not have the same aspect ratio, tensor shapes will not match
    # Pad tensor if downsampled_mask.shape[1] is smaller than num_queries
    if downsampled_area < num_queries:
        mask_downsample = ops.Pad(paddings=((0, 0), (0, num_queries - mask_downsample.shape[1])))(mask_downsample)
    # Discard last embeddings if downsampled_mask.shape[1] is bigger than num_queries
    if downsampled_area > num_queries:
        mask_downsample = mask_downsample[:, :num_queries]

    # Repeat last dimension to match SDPA output shape
    mask_downsample = mask_downsample.view(mask_downsample.shape[0], mask_downsample.shape[1], 1).tile(
        (1, 1, value_embed_dim)
    )

    return mask_downsample