diff --git a/toolkit/models/ilora.py b/toolkit/models/ilora.py
index c9797495..f292b694 100644
--- a/toolkit/models/ilora.py
+++ b/toolkit/models/ilora.py
@@ -268,6 +268,10 @@ class InstantLoRAModule(torch.nn.Module):
         self.output_size = output_size
 
+        number_formatted_output_size = "{:,}".format(output_size)
+
+        print(f" ILORA output size: {number_formatted_output_size}")
+
         # if not evenly divisible, error
         if self.output_size % self.num_heads != 0:
             raise ValueError("Output size must be divisible by the number of heads")
 
@@ -284,6 +288,7 @@ class InstantLoRAModule(torch.nn.Module):
             embedding_dim=vision_hidden_size,
             max_seq_len=vision_tokens,
             output_dim=head_dim,
+            apply_pos_emb=True, # this is new
             ff_mult=4
         )
 
diff --git a/toolkit/models/te_adapter.py b/toolkit/models/te_adapter.py
index ad43f221..cc7679aa 100644
--- a/toolkit/models/te_adapter.py
+++ b/toolkit/models/te_adapter.py
@@ -382,6 +382,7 @@ class TEAdapter(torch.nn.Module):
     def encode_text(self, text):
         te: T5EncoderModel = self.te_ref()
         tokenizer: T5Tokenizer = self.tokenizer_ref()
+        attn_mask_float = None
 
         # input_ids = tokenizer(
         #     text,
@@ -424,13 +425,18 @@ class TEAdapter(torch.nn.Module):
             attn_mask_float = attention_mask.to(embeds.device, dtype=embeds.dtype)
         if self.text_projection is not None:
             # pool the output of embeds ignoring 0 in the attention mask
-            pooled_output = embeds * attn_mask_float.unsqueeze(-1)
+            if attn_mask_float is not None:
+                pooled_output = embeds * attn_mask_float.unsqueeze(-1)
+            else:
+                pooled_output = embeds
             # reduce along dim 1 while maintaining batch and dim 2
             pooled_output_sum = pooled_output.sum(dim=1)
-            attn_mask_sum = attn_mask_float.sum(dim=1).unsqueeze(-1)
-            pooled_output = pooled_output_sum / attn_mask_sum
+            if attn_mask_float is not None:
+                attn_mask_sum = attn_mask_float.sum(dim=1).unsqueeze(-1)
+
+                pooled_output = pooled_output_sum / attn_mask_sum
 
             pooled_embeds = self.text_projection(pooled_output)
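
The te_adapter.py hunk guards a masked mean-pooling step against a missing attention mask. Below is a minimal standalone sketch of that pattern for reference; the function name `pool_embeds`, the tensor shapes, and the plain-mean fallback when no mask is available are illustrative assumptions, not part of the patch (the patch itself leaves `pooled_output` as the un-pooled embeds in that case).

from typing import Optional
import torch

def pool_embeds(embeds: torch.Tensor, attention_mask: Optional[torch.Tensor]) -> torch.Tensor:
    # Masked mean pooling over the sequence dimension (dim=1).
    if attention_mask is None:
        # Assumed fallback for illustration: plain mean over all tokens.
        return embeds.mean(dim=1)
    mask = attention_mask.to(embeds.device, dtype=embeds.dtype)
    summed = (embeds * mask.unsqueeze(-1)).sum(dim=1)  # zero out padded tokens, sum over seq
    counts = mask.sum(dim=1).unsqueeze(-1)             # number of real tokens per example
    return summed / counts

# Hypothetical usage: batch of 2, seq len 8, hidden size 768, last 2 tokens padded.
embeds = torch.randn(2, 8, 768)
mask = torch.ones(2, 8)
mask[:, 6:] = 0
pooled = pool_embeds(embeds, mask)  # shape (2, 768)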