Mirror of https://github.com/ostris/ai-toolkit.git (synced 2026-02-08 06:29:56 +00:00)
Added adapter modules for text encoders and direct vision
@@ -698,7 +698,8 @@ class SDTrainer(BaseSDTrainProcess):
        can_disable_adapter = False
        was_adapter_active = False
        if self.adapter is not None and (isinstance(self.adapter, IPAdapter) or
-                isinstance(self.adapter, ReferenceAdapter)
+                isinstance(self.adapter, ReferenceAdapter) or
+                (isinstance(self.adapter, CustomAdapter))
        ):
            can_disable_adapter = True
            was_adapter_active = self.adapter.is_active

@@ -177,6 +177,10 @@ class AdapterConfig:
        else:
            self.clip_layer = 'last_hidden_state'

+        # text encoder
+        self.text_encoder_path: str = kwargs.get('text_encoder_path', None)
+        self.text_encoder_arch: str = kwargs.get('text_encoder_arch', 'clip')  # clip t5


class EmbeddingConfig:
    def __init__(self, **kwargs):

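Note: a minimal sketch of how the new text-encoder options might be passed to AdapterConfig; the 'type' key, the import path, and the example model id are assumptions for illustration, not taken from this commit.

    # hypothetical usage of the new text-encoder config keys
    from toolkit.config_modules import AdapterConfig  # module path assumed

    adapter_config = AdapterConfig(
        type='text_encoder',                      # assumed key name; selects the TEAdapter path
        text_encoder_path='google/flan-t5-base',  # any T5 or CLIP text-encoder repo/path, example only
        text_encoder_arch='t5',                   # 'clip' (default) or 't5', as parsed above
    )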
@@ -3,11 +3,14 @@ import sys

from PIL import Image
from torch.nn import Parameter
-from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection, T5EncoderModel, CLIPTextModel, \
+    CLIPTokenizer, T5Tokenizer

from toolkit.models.clip_fusion import CLIPFusionModule
from toolkit.models.clip_pre_processor import CLIPImagePreProcessor
from toolkit.models.ilora import InstantLoRAModule
+from toolkit.models.te_adapter import TEAdapter
+from toolkit.models.vd_adapter import VisionDirectAdapter
from toolkit.paths import REPOS_ROOT
from toolkit.photomaker import PhotoMakerIDEncoder, FuseModule, PhotoMakerCLIPEncoder
from toolkit.saving import load_ip_adapter_model
@@ -77,6 +80,13 @@ class CustomAdapter(torch.nn.Module):
        self.clip_fusion_module: CLIPFusionModule = None
        self.ilora_module: InstantLoRAModule = None

+        self.te: Union[T5EncoderModel, CLIPTextModel] = None
+        self.tokenizer: CLIPTokenizer = None
+        self.te_adapter: TEAdapter = None
+        self.vd_adapter: VisionDirectAdapter = None
+        self.conditional_embeds: Optional[torch.Tensor] = None
+        self.unconditional_embeds: Optional[torch.Tensor] = None
+
        self.setup_adapter()

        if self.adapter_type == 'photo_maker':
@@ -117,6 +127,23 @@ class CustomAdapter(torch.nn.Module):
                vision_hidden_size=self.vision_encoder.config.hidden_size,
                sd=self.sd_ref()
            )
+        elif self.adapter_type == 'text_encoder':
+            if self.config.text_encoder_arch == 't5':
+                self.te = T5EncoderModel.from_pretrained(self.config.text_encoder_path).to(
+                    self.sd_ref().unet.device,
+                    dtype=get_torch_dtype(self.sd_ref().dtype))
+                self.tokenizer = T5Tokenizer.from_pretrained(self.config.text_encoder_path)
+            elif self.config.text_encoder_arch == 'clip':
+                self.te = CLIPTextModel.from_pretrained(self.config.text_encoder_path).to(
+                    self.sd_ref().unet.device,
+                    dtype=get_torch_dtype(self.sd_ref().dtype))
+                self.tokenizer = CLIPTokenizer.from_pretrained(self.config.text_encoder_path)
+            else:
+                raise ValueError(f"unknown text encoder arch: {self.config.text_encoder_arch}")
+
+            self.te_adapter = TEAdapter(self, self.sd_ref(), self.te, self.tokenizer)
+        elif self.adapter_type == 'vision_direct':
+            self.vd_adapter = VisionDirectAdapter(self, self.sd_ref(), self.vision_encoder)
        else:
            raise ValueError(f"unknown adapter type: {self.adapter_type}")

@@ -148,6 +175,8 @@ class CustomAdapter(torch.nn.Module):
    def setup_clip(self):
        adapter_config = self.config
        sd = self.sd_ref()
+        if self.config.type == "text_encoder":
+            return
        if self.config.type == 'photo_maker':
            try:
                self.image_processor = CLIPImageProcessor.from_pretrained(self.config.image_encoder_path)
@@ -298,6 +327,12 @@ class CustomAdapter(torch.nn.Module):
                    raise ValueError(f"unknown shape: {v.shape}")
            self.fuse_module.load_state_dict(current_state_dict, strict=strict)

+        if 'te_adapter' in state_dict:
+            self.te_adapter.load_state_dict(state_dict['te_adapter'], strict=strict)
+
+        if 'vd_adapter' in state_dict:
+            self.vd_adapter.load_state_dict(state_dict['vd_adapter'], strict=strict)
+
        if 'vision_encoder' in state_dict and self.config.train_image_encoder:
            self.vision_encoder.load_state_dict(state_dict['vision_encoder'], strict=strict)

@@ -325,6 +360,12 @@ class CustomAdapter(torch.nn.Module):
            state_dict["vision_encoder"] = self.vision_encoder.state_dict()
            state_dict["clip_fusion"] = self.clip_fusion_module.state_dict()
            return state_dict
+        elif self.adapter_type == 'text_encoder':
+            state_dict["te_adapter"] = self.te_adapter.state_dict()
+            return state_dict
+        elif self.adapter_type == 'vision_direct':
+            state_dict["vd_adapter"] = self.vd_adapter.state_dict()
+            return state_dict
        elif self.adapter_type == 'ilora':
            if self.config.train_image_encoder:
                state_dict["vision_encoder"] = self.vision_encoder.state_dict()
@@ -338,7 +379,16 @@ class CustomAdapter(torch.nn.Module):
            prompt: Union[List[str], str],
            is_unconditional: bool = False,
    ):
-        if self.adapter_type == 'clip_fusion' or self.adapter_type == 'ilora':
+        if self.adapter_type == 'clip_fusion' or self.adapter_type == 'ilora' or self.adapter_type == 'vision_direct':
            return prompt
+        elif self.adapter_type == 'text_encoder':
+            # todo allow for training
+            with torch.no_grad():
+                # encode and save the embeds
+                if is_unconditional:
+                    self.unconditional_embeds = self.te_adapter.encode_text(prompt).detach()
+                else:
+                    self.conditional_embeds = self.te_adapter.encode_text(prompt).detach()
+            return prompt
        elif self.adapter_type == 'photo_maker':
            if is_unconditional:
@@ -429,6 +479,9 @@ class CustomAdapter(torch.nn.Module):

            return prompt

+        else:
+            return prompt
+
    def condition_encoded_embeds(
            self,
            tensors_0_1: torch.Tensor,
@@ -534,11 +587,9 @@ class CustomAdapter(torch.nn.Module):

                img_embeds = chunk_sum / quad_count

            if not is_training or not self.config.train_image_encoder:
                img_embeds = img_embeds.detach()

            prompt_embeds.text_embeds = self.clip_fusion_module(
                prompt_embeds.text_embeds,
                img_embeds
@@ -547,7 +598,24 @@ class CustomAdapter(torch.nn.Module):

        else:
            raise NotImplementedError
        return prompt_embeds

+    def get_empty_clip_image(self, batch_size: int) -> torch.Tensor:
+        with torch.no_grad():
+            tensors_0_1 = torch.rand([batch_size, 3, self.input_size, self.input_size], device=self.device)
+            noise_scale = torch.rand([tensors_0_1.shape[0], 1, 1, 1], device=self.device,
+                                     dtype=get_torch_dtype(self.sd_ref().dtype))
+            tensors_0_1 = tensors_0_1 * noise_scale
+            # tensors_0_1 = tensors_0_1 * 0
+            mean = torch.tensor(self.clip_image_processor.image_mean).to(
+                self.device, dtype=get_torch_dtype(self.sd_ref().dtype)
+            ).detach()
+            std = torch.tensor(self.clip_image_processor.image_std).to(
+                self.device, dtype=get_torch_dtype(self.sd_ref().dtype)
+            ).detach()
+            tensors_0_1 = torch.clip((255. * tensors_0_1), 0, 255).round() / 255.0
+            clip_image = (tensors_0_1 - mean.view([1, 3, 1, 1])) / std.view([1, 3, 1, 1])
+            return clip_image.detach()
+
    def trigger_pre_te(
            self,
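Note: get_empty_clip_image() above builds the "unconditional" image input as faint, 8-bit-quantized noise normalized with the processor's CLIP mean/std. A self-contained sketch of the same normalization, where the mean/std constants are the usual OpenAI CLIP values and are assumed here:

    import torch

    mean = torch.tensor([0.48145466, 0.4578275, 0.40821073]).view(1, 3, 1, 1)  # assumed CLIP defaults
    std = torch.tensor([0.26862954, 0.26130258, 0.27577711]).view(1, 3, 1, 1)

    noise = torch.rand(1, 3, 224, 224) * torch.rand(1, 1, 1, 1)        # low-magnitude random image
    noise = torch.clip(noise * 255.0, 0, 255).round() / 255.0          # snap to 8-bit levels like a real image
    empty_clip_image = (noise - mean) / std                            # normalized "empty" CLIP input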
@@ -556,22 +624,11 @@ class CustomAdapter(torch.nn.Module):
            has_been_preprocessed=False,
            quad_count=4,
    ) -> PromptEmbeds:
-        if self.adapter_type == 'ilora':
+        if self.adapter_type == 'ilora' or self.adapter_type == 'vision_direct':
            if tensors_0_1 is None:
-                # scale the noise down
-                tensors_0_1 = torch.rand([1, 3, self.input_size, self.input_size], device=self.device)
-                noise_scale = torch.rand([tensors_0_1.shape[0], 1, 1, 1], device=self.device,
-                                         dtype=get_torch_dtype(self.sd_ref().dtype))
-                tensors_0_1 = tensors_0_1 * noise_scale
-                # tensors_0_1 = tensors_0_1 * 0
-                mean = torch.tensor(self.clip_image_processor.image_mean).to(
-                    self.device, dtype=get_torch_dtype(self.sd_ref().dtype)
-                ).detach()
-                std = torch.tensor(self.clip_image_processor.image_std).to(
-                    self.device, dtype=get_torch_dtype(self.sd_ref().dtype)
-                ).detach()
-                tensors_0_1 = torch.clip((255. * tensors_0_1), 0, 255).round() / 255.0
-                clip_image = (tensors_0_1 - mean.view([1, 3, 1, 1])) / std.view([1, 3, 1, 1])
+                tensors_0_1 = self.get_empty_clip_image(1)
                has_been_preprocessed = True

        with torch.no_grad():
            # on training the clip image is created in the dataloader
            if not has_been_preprocessed:
@@ -593,6 +650,15 @@ class CustomAdapter(torch.nn.Module):
                ).pixel_values
            else:
                clip_image = tensors_0_1

+            batch_size = clip_image.shape[0]
+            if self.adapter_type == 'vision_direct':
+                # add an unconditional so we can save it
+                unconditional = self.get_empty_clip_image(batch_size).to(
+                    clip_image.device, dtype=clip_image.dtype
+                )
+                clip_image = torch.cat([unconditional, clip_image], dim=0)
+
            clip_image = clip_image.to(self.device, dtype=get_torch_dtype(self.sd_ref().dtype)).detach()

            if self.config.quad_image:
@@ -637,11 +703,36 @@ class CustomAdapter(torch.nn.Module):

                img_embeds = chunk_sum / quad_count

            if not is_training or not self.config.train_image_encoder:
                img_embeds = img_embeds.detach()

            self.ilora_module(img_embeds)
+        if self.adapter_type == 'vision_direct':
+            with torch.set_grad_enabled(is_training):
+                if is_training and self.config.train_image_encoder:
+                    self.vision_encoder.train()
+                    clip_image = clip_image.requires_grad_(True)
+                else:
+                    with torch.no_grad():
+                        self.vision_encoder.eval()
+                clip_output = self.vision_encoder(
+                    clip_image,
+                    output_hidden_states=True,
+                )
+                if self.config.clip_layer == 'penultimate_hidden_states':
+                    # they skip last layer for ip+
+                    # https://github.com/tencent-ailab/IP-Adapter/blob/f4b6742db35ea6d81c7b829a55b0a312c7f5a677/tutorial_train_plus.py#L403C26-L403C26
+                    clip_image_embeds = clip_output.hidden_states[-2]
+                elif self.config.clip_layer == 'last_hidden_state':
+                    clip_image_embeds = clip_output.hidden_states[-1]
+                else:
+                    clip_image_embeds = clip_output.image_embeds
+
+                if not is_training or not self.config.train_image_encoder:
+                    clip_image_embeds = clip_image_embeds.detach()
+
+                # save them to the conditional and unconditional
+                self.unconditional_embeds, self.conditional_embeds = clip_image_embeds.chunk(2, dim=0)

    def parameters(self, recurse: bool = True) -> Iterator[Parameter]:
        if self.config.type == 'photo_maker':
@@ -656,5 +747,11 @@ class CustomAdapter(torch.nn.Module):
            yield from self.ilora_module.parameters(recurse)
            if self.config.train_image_encoder:
                yield from self.vision_encoder.parameters(recurse)
+        elif self.config.type == 'text_encoder':
+            for attn_processor in self.te_adapter.adapter_modules:
+                yield from attn_processor.parameters(recurse)
+        elif self.config.type == 'vision_direct':
+            for attn_processor in self.vd_adapter.adapter_modules:
+                yield from attn_processor.parameters(recurse)
        else:
            raise NotImplementedError

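Note: with the two new branches above, a 'text_encoder' or 'vision_direct' adapter exposes only its attention-processor weights through parameters(). A hedged sketch of how a trainer might consume them; the variable name is illustrative only:

    import torch

    # adapter is assumed to be a CustomAdapter configured with type='vision_direct'
    optimizer = torch.optim.AdamW(adapter.parameters(), lr=1e-4)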
@@ -407,11 +407,11 @@ class ImageProcessingDTOMixin:
        w, h = img.size
        if w > h and self.scale_to_width < self.scale_to_height:
            # throw error, they should match
-            raise ValueError(
+            print(
                f"unexpected values: w={w}, h={h}, file_item.scale_to_width={self.scale_to_width}, file_item.scale_to_height={self.scale_to_height}, file_item.path={self.path}")
        elif h > w and self.scale_to_height < self.scale_to_width:
            # throw error, they should match
-            raise ValueError(
+            print(
                f"unexpected values: w={w}, h={h}, file_item.scale_to_width={self.scale_to_width}, file_item.scale_to_height={self.scale_to_height}, file_item.path={self.path}")

        if self.flip_x:
@@ -681,10 +681,12 @@ class ClipImageFileItemDTOMixin:
                self.clip_image_embeds_unconditional = load_file(unconditional_path)

            return
-        img = Image.open(self.clip_image_path).convert('RGB')
+        try:
+            img = Image.open(self.clip_image_path).convert('RGB')
+            img = exif_transpose(img)
+        except Exception as e:
+            # make a random noise image
+            img = Image.new('RGB', (self.dataset_config.resolution, self.dataset_config.resolution))
+            print(f"Error: {e}")
+            print(f"Error loading image: {self.clip_image_path}")

toolkit/models/te_adapter.py (new file, 260 lines)
@@ -0,0 +1,260 @@
import sys

import torch
import torch.nn as nn
import torch.nn.functional as F
import weakref
from typing import Union, TYPE_CHECKING

from transformers import T5EncoderModel, CLIPTextModel, CLIPTokenizer, T5Tokenizer
from toolkit.paths import REPOS_ROOT
sys.path.append(REPOS_ROOT)

from ipadapter.ip_adapter.attention_processor import AttnProcessor2_0

if TYPE_CHECKING:
    from toolkit.stable_diffusion_model import StableDiffusion
    from toolkit.custom_adapter import CustomAdapter


class TEAdapterAttnProcessor(nn.Module):
    r"""
    Attention processor for Custom TE for PyTorch 2.0.
    Args:
        hidden_size (`int`):
            The hidden size of the attention layer.
        cross_attention_dim (`int`):
            The number of channels in the `encoder_hidden_states`.
        scale (`float`, defaults to 1.0):
            the weight scale of image prompt.
        num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16):
            The context length of the image features.
        adapter
    """

    def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_tokens=4, adapter=None,
                 adapter_hidden_size=None):
        super().__init__()

        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

        self.adapter_ref: weakref.ref = weakref.ref(adapter)

        self.hidden_size = hidden_size
        self.adapter_hidden_size = adapter_hidden_size
        self.cross_attention_dim = cross_attention_dim
        self.scale = scale
        self.num_tokens = num_tokens

        self.to_k_adapter = nn.Linear(adapter_hidden_size, hidden_size, bias=False)
        self.to_v_adapter = nn.Linear(adapter_hidden_size, hidden_size, bias=False)

    @property
    def is_active(self):
        return self.adapter_ref().is_active

    @property
    def unconditional_embeds(self):
        return self.adapter_ref().adapter_ref().unconditional_embeds

    @property
    def conditional_embeds(self):
        return self.adapter_ref().adapter_ref().conditional_embeds

    def __call__(
            self,
            attn,
            hidden_states,
            encoder_hidden_states=None,
            attention_mask=None,
            temb=None,
    ):
        is_active = self.adapter_ref().is_active
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )

        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)

        # will be none if disabled
        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        # only use one TE or the other. If our adapter is active only use ours
        if self.is_active and self.conditional_embeds is not None:
            adapter_hidden_states = self.conditional_embeds
            # check if we are doing unconditional
            if self.unconditional_embeds is not None and adapter_hidden_states.shape[0] != encoder_hidden_states.shape[0]:
                # concat unconditional to match the hidden state batch size
                if self.unconditional_embeds.shape[0] == 1 and adapter_hidden_states.shape[0] != 1:
                    unconditional = torch.cat([self.unconditional_embeds] * adapter_hidden_states.shape[0], dim=0)
                else:
                    unconditional = self.unconditional_embeds
                adapter_hidden_states = torch.cat([unconditional, adapter_hidden_states], dim=0)
            # for ip-adapter
            key = self.to_k_adapter(adapter_hidden_states)
            value = self.to_v_adapter(adapter_hidden_states)
        else:
            key = attn.to_k(encoder_hidden_states)
            value = attn.to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        try:
            key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
            value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        except RuntimeError:
            raise RuntimeError(f"key shape: {key.shape}, value shape: {value.shape}")

        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        # TODO: add support for attn.scale when we move to Torch 2.1
        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states


class TEAdapter(torch.nn.Module):
    def __init__(
            self,
            adapter: 'CustomAdapter',
            sd: 'StableDiffusion',
            te: Union[T5EncoderModel, CLIPTextModel],
            tokenizer: CLIPTokenizer
    ):
        super(TEAdapter, self).__init__()
        self.adapter_ref: weakref.ref = weakref.ref(adapter)
        self.sd_ref: weakref.ref = weakref.ref(sd)
        self.te_ref: weakref.ref = weakref.ref(te)
        self.tokenizer_ref: weakref.ref = weakref.ref(tokenizer)

        self.token_size = self.te_ref().config.d_model

        # init adapter modules
        attn_procs = {}
        unet_sd = sd.unet.state_dict()
        for name in sd.unet.attn_processors.keys():
            cross_attention_dim = None if name.endswith("attn1.processor") else sd.unet.config['cross_attention_dim']
            if name.startswith("mid_block"):
                hidden_size = sd.unet.config['block_out_channels'][-1]
            elif name.startswith("up_blocks"):
                block_id = int(name[len("up_blocks.")])
                hidden_size = list(reversed(sd.unet.config['block_out_channels']))[block_id]
            elif name.startswith("down_blocks"):
                block_id = int(name[len("down_blocks.")])
                hidden_size = sd.unet.config['block_out_channels'][block_id]
            else:
                # they didnt have this, but would lead to undefined below
                raise ValueError(f"unknown attn processor name: {name}")
            if cross_attention_dim is None:
                attn_procs[name] = AttnProcessor2_0()
            else:
                layer_name = name.split(".processor")[0]
                to_k_adapter = unet_sd[layer_name + ".to_k.weight"]
                to_v_adapter = unet_sd[layer_name + ".to_v.weight"]

                # add zero padding to the adapter
                if to_k_adapter.shape[1] < self.token_size:
                    to_k_adapter = torch.cat([
                        to_k_adapter,
                        torch.randn(to_k_adapter.shape[0], self.token_size - to_k_adapter.shape[1]).to(
                            to_k_adapter.device, dtype=to_k_adapter.dtype) * 0.01
                    ],
                        dim=1
                    )
                    to_v_adapter = torch.cat([
                        to_v_adapter,
                        torch.randn(to_v_adapter.shape[0], self.token_size - to_v_adapter.shape[1]).to(
                            to_k_adapter.device, dtype=to_k_adapter.dtype) * 0.01
                    ],
                        dim=1
                    )
                elif to_k_adapter.shape[1] > self.token_size:
                    to_k_adapter = to_k_adapter[:, :self.token_size]
                    to_v_adapter = to_v_adapter[:, :self.token_size]
                else:
                    to_k_adapter = to_k_adapter
                    to_v_adapter = to_v_adapter

                # todo resize to the TE hidden size
                weights = {
                    "to_k_adapter.weight": to_k_adapter,
                    "to_v_adapter.weight": to_v_adapter,
                }

                attn_procs[name] = TEAdapterAttnProcessor(
                    hidden_size=hidden_size,
                    cross_attention_dim=cross_attention_dim,
                    scale=1.0,
                    num_tokens=self.adapter_ref().config.num_tokens,
                    adapter=self,
                    adapter_hidden_size=self.token_size
                )
                attn_procs[name].load_state_dict(weights)
        sd.unet.set_attn_processor(attn_procs)
        self.adapter_modules = torch.nn.ModuleList(sd.unet.attn_processors.values())

    # make a getter to see if is active
    @property
    def is_active(self):
        return self.adapter_ref().is_active

    def encode_text(self, text):
        te: T5EncoderModel = self.te_ref()
        tokenizer: T5Tokenizer = self.tokenizer_ref()

        input_ids = tokenizer(
            text,
            max_length=77,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        ).input_ids.to(te.device)
        outputs = te(input_ids=input_ids)
        return outputs.last_hidden_state

    def forward(self, input):
        return input
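Note: a short, hedged sketch of what TEAdapter.encode_text() above returns for a T5 backbone; the checkpoint id is an example, and wiring the result into a CustomAdapter/UNet is omitted here:

    from transformers import T5EncoderModel, T5Tokenizer

    te = T5EncoderModel.from_pretrained('google/flan-t5-base')   # example checkpoint, assumed
    tok = T5Tokenizer.from_pretrained('google/flan-t5-base')

    ids = tok(["a photo of a cat"], max_length=77, padding="max_length",
              truncation=True, return_tensors="pt").input_ids
    embeds = te(input_ids=ids).last_hidden_state                 # [1, 77, d_model]; consumed by to_k_adapter/to_v_adapter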
toolkit/models/vd_adapter.py (new file, 253 lines)
@@ -0,0 +1,253 @@
import sys

import torch
import torch.nn as nn
import torch.nn.functional as F
import weakref
from typing import Union, TYPE_CHECKING

from transformers import T5EncoderModel, CLIPTextModel, CLIPTokenizer, T5Tokenizer, CLIPVisionModelWithProjection
from toolkit.paths import REPOS_ROOT
sys.path.append(REPOS_ROOT)

from ipadapter.ip_adapter.attention_processor import AttnProcessor2_0

if TYPE_CHECKING:
    from toolkit.stable_diffusion_model import StableDiffusion
    from toolkit.custom_adapter import CustomAdapter


class VisionDirectAdapterAttnProcessor(nn.Module):
    r"""
    Attention processor for Custom TE for PyTorch 2.0.
    Args:
        hidden_size (`int`):
            The hidden size of the attention layer.
        cross_attention_dim (`int`):
            The number of channels in the `encoder_hidden_states`.
        scale (`float`, defaults to 1.0):
            the weight scale of image prompt.
        adapter
    """

    def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, adapter=None,
                 adapter_hidden_size=None):
        super().__init__()

        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

        self.adapter_ref: weakref.ref = weakref.ref(adapter)

        self.hidden_size = hidden_size
        self.adapter_hidden_size = adapter_hidden_size
        self.cross_attention_dim = cross_attention_dim
        self.scale = scale

        self.to_k_adapter = nn.Linear(adapter_hidden_size, hidden_size, bias=False)
        self.to_v_adapter = nn.Linear(adapter_hidden_size, hidden_size, bias=False)

    @property
    def is_active(self):
        return self.adapter_ref().is_active

    @property
    def unconditional_embeds(self):
        return self.adapter_ref().adapter_ref().unconditional_embeds

    @property
    def conditional_embeds(self):
        return self.adapter_ref().adapter_ref().conditional_embeds

    def __call__(
            self,
            attn,
            hidden_states,
            encoder_hidden_states=None,
            attention_mask=None,
            temb=None,
    ):
        is_active = self.adapter_ref().is_active
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )

        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)

        # will be none if disabled
        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        # TODO: add support for attn.scale when we move to Torch 2.1
        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        # only use one TE or the other. If our adapter is active only use ours
        if self.is_active and self.conditional_embeds is not None:

            adapter_hidden_states = self.conditional_embeds
            if adapter_hidden_states.shape[0] < batch_size:
                adapter_hidden_states = torch.cat([
                    self.unconditional_embeds,
                    adapter_hidden_states
                ])
            # conditional_batch_size = adapter_hidden_states.shape[0]
            # conditional_query = query

            # for ip-adapter
            vd_key = self.to_k_adapter(adapter_hidden_states)
            vd_value = self.to_v_adapter(adapter_hidden_states)

            vd_key = vd_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
            vd_value = vd_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

            # the output of sdp = (batch, num_heads, seq_len, head_dim)
            # TODO: add support for attn.scale when we move to Torch 2.1
            vd_hidden_states = F.scaled_dot_product_attention(
                query, vd_key, vd_value, attn_mask=None, dropout_p=0.0, is_causal=False
            )

            vd_hidden_states = vd_hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
            vd_hidden_states = vd_hidden_states.to(query.dtype)

            hidden_states = hidden_states + self.scale * vd_hidden_states

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states


class VisionDirectAdapter(torch.nn.Module):
    def __init__(
            self,
            adapter: 'CustomAdapter',
            sd: 'StableDiffusion',
            vision_model: Union[CLIPVisionModelWithProjection],
    ):
        super(VisionDirectAdapter, self).__init__()
        self.adapter_ref: weakref.ref = weakref.ref(adapter)
        self.sd_ref: weakref.ref = weakref.ref(sd)
        self.vision_model_ref: weakref.ref = weakref.ref(vision_model)

        self.token_size = vision_model.config.hidden_size

        # init adapter modules
        attn_procs = {}
        unet_sd = sd.unet.state_dict()
        for name in sd.unet.attn_processors.keys():
            cross_attention_dim = None if name.endswith("attn1.processor") else sd.unet.config['cross_attention_dim']
            if name.startswith("mid_block"):
                hidden_size = sd.unet.config['block_out_channels'][-1]
            elif name.startswith("up_blocks"):
                block_id = int(name[len("up_blocks.")])
                hidden_size = list(reversed(sd.unet.config['block_out_channels']))[block_id]
            elif name.startswith("down_blocks"):
                block_id = int(name[len("down_blocks.")])
                hidden_size = sd.unet.config['block_out_channels'][block_id]
            else:
                # they didnt have this, but would lead to undefined below
                raise ValueError(f"unknown attn processor name: {name}")
            if cross_attention_dim is None:
                attn_procs[name] = AttnProcessor2_0()
            else:
                layer_name = name.split(".processor")[0]
                to_k_adapter = unet_sd[layer_name + ".to_k.weight"]
                to_v_adapter = unet_sd[layer_name + ".to_v.weight"]

                # add zero padding to the adapter
                if to_k_adapter.shape[1] < self.token_size:
                    to_k_adapter = torch.cat([
                        to_k_adapter,
                        torch.randn(to_k_adapter.shape[0], self.token_size - to_k_adapter.shape[1]).to(
                            to_k_adapter.device, dtype=to_k_adapter.dtype) * 0.01
                    ],
                        dim=1
                    )
                    to_v_adapter = torch.cat([
                        to_v_adapter,
                        torch.randn(to_v_adapter.shape[0], self.token_size - to_v_adapter.shape[1]).to(
                            to_k_adapter.device, dtype=to_k_adapter.dtype) * 0.01
                    ],
                        dim=1
                    )
                elif to_k_adapter.shape[1] > self.token_size:
                    to_k_adapter = to_k_adapter[:, :self.token_size]
                    to_v_adapter = to_v_adapter[:, :self.token_size]
                else:
                    to_k_adapter = to_k_adapter
                    to_v_adapter = to_v_adapter

                # todo resize to the TE hidden size
                weights = {
                    "to_k_adapter.weight": to_k_adapter,
                    "to_v_adapter.weight": to_v_adapter,
                }

                attn_procs[name] = VisionDirectAdapterAttnProcessor(
                    hidden_size=hidden_size,
                    cross_attention_dim=cross_attention_dim,
                    scale=1.0,
                    adapter=self,
                    adapter_hidden_size=self.token_size
                )
                attn_procs[name].load_state_dict(weights)
        sd.unet.set_attn_processor(attn_procs)
        self.adapter_modules = torch.nn.ModuleList(sd.unet.attn_processors.values())

    # make a getter to see if is active
    @property
    def is_active(self):
        return self.adapter_ref().is_active

    def forward(self, input):
        return input
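Note: the core of VisionDirectAdapterAttnProcessor above is an extra attention pass over the CLIP vision tokens whose result is blended into the regular cross-attention output. A minimal standalone sketch of that blend with toy tensors; the shapes and names are illustrative, not taken from the commit:

    import torch
    import torch.nn.functional as F

    b, heads, q_len, head_dim = 2, 8, 64, 40
    query = torch.randn(b, heads, q_len, head_dim)
    vd_key = torch.randn(b, heads, 257, head_dim)    # e.g. 257 CLIP hidden-state tokens, assumed
    vd_value = torch.randn(b, heads, 257, head_dim)

    text_out = torch.randn(b, heads, q_len, head_dim).transpose(1, 2).reshape(b, q_len, heads * head_dim)
    vd_out = F.scaled_dot_product_attention(query, vd_key, vd_value)
    vd_out = vd_out.transpose(1, 2).reshape(b, q_len, heads * head_dim)

    scale = 1.0
    mixed = text_out + scale * vd_out                # what the processor passes on to attn.to_out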
@@ -478,6 +478,7 @@ class StableDiffusion:
            gen_config = image_configs[i]

            extra = {}
+            validation_image = None
            if self.adapter is not None and gen_config.adapter_image_path is not None:
                validation_image = Image.open(gen_config.adapter_image_path).convert("RGB")
                if isinstance(self.adapter, T2IAdapter):
@@ -528,7 +529,7 @@ class StableDiffusion:
                )
                gen_config.negative_prompt_2 = gen_config.negative_prompt

-            if self.adapter is not None and isinstance(self.adapter, CustomAdapter):
+            if self.adapter is not None and isinstance(self.adapter, CustomAdapter) and validation_image is not None:
                self.adapter.trigger_pre_te(
                    tensors_0_1=validation_image,
                    is_training=False,
@@ -559,7 +560,7 @@ class StableDiffusion:
                conditional_embeds = self.adapter(conditional_embeds, conditional_clip_embeds)
                unconditional_embeds = self.adapter(unconditional_embeds, unconditional_clip_embeds)

-            if self.adapter is not None and isinstance(self.adapter, CustomAdapter):
+            if self.adapter is not None and isinstance(self.adapter, CustomAdapter) and validation_image is not None:
                conditional_embeds = self.adapter.condition_encoded_embeds(
                    tensors_0_1=validation_image,
                    prompt_embeds=conditional_embeds,