From 2f87f123b2415e9bb3bb091758fb6e8f5b776caa Mon Sep 17 00:00:00 2001
From: layerdiffusion <19834515+lllyasviel@users.noreply.github.com>
Date: Sat, 27 Jul 2024 13:23:39 -0700
Subject: [PATCH] less aggressive clip skip to make CivitAI pony image meta
 works better

---
 modules_forge/forge_clip.py | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/modules_forge/forge_clip.py b/modules_forge/forge_clip.py
index 3dc8584a..7c787c16 100644
--- a/modules_forge/forge_clip.py
+++ b/modules_forge/forge_clip.py
@@ -13,18 +13,18 @@ def move_clip_to_gpu():
         return
 
 
-def apply_clip_skip_to_transformer_outputs(x, last_layer, skip):
-    return x.hidden_states[last_layer + 1 - skip]
-
-
 class CLIP_SD_15_L(FrozenCLIPEmbedderWithCustomWords):
+    def __init__(self, wrapped, hijack):
+        super().__init__(wrapped, hijack)
+        self.minimal_clip_skip = 1
+
     def encode_with_transformers(self, tokens):
         move_clip_to_gpu()
         self.wrapped.transformer.text_model.embeddings.to(tokens.device)
         outputs = self.wrapped.transformer(input_ids=tokens, output_hidden_states=-opts.CLIP_stop_at_last_layers)
 
-        if opts.CLIP_stop_at_last_layers > 1:
-            z = apply_clip_skip_to_transformer_outputs(outputs, last_layer=-1, skip=opts.CLIP_stop_at_last_layers)
+        if opts.CLIP_stop_at_last_layers > self.minimal_clip_skip:
+            z = outputs.hidden_states[-opts.CLIP_stop_at_last_layers]
             z = self.wrapped.transformer.text_model.final_layer_norm(z)
         else:
             z = outputs.last_hidden_state
@@ -43,14 +43,15 @@ class CLIP_SD_21_H(FrozenCLIPEmbedderWithCustomWords):
         self.id_start = 49406
         self.id_end = 49407
         self.id_pad = 0
+        self.minimal_clip_skip = 2
 
     def encode_with_transformers(self, tokens):
         move_clip_to_gpu()
         self.wrapped.transformer.text_model.embeddings.to(tokens.device)
         outputs = self.wrapped.transformer(tokens, output_hidden_states=self.wrapped.layer == "hidden")
 
-        if opts.CLIP_stop_at_last_layers > 1:
-            z = apply_clip_skip_to_transformer_outputs(outputs, last_layer=self.wrapped.layer_idx, skip=opts.CLIP_stop_at_last_layers)
+        if opts.CLIP_stop_at_last_layers > self.minimal_clip_skip:
+            z = outputs.hidden_states[-opts.CLIP_stop_at_last_layers]
             z = self.wrapped.transformer.text_model.final_layer_norm(z)
         elif self.wrapped.layer == "last":
             z = outputs.last_hidden_state
@@ -64,13 +65,14 @@ class CLIP_SD_21_H(FrozenCLIPEmbedderWithCustomWords):
 class CLIP_SD_XL_L(FrozenCLIPEmbedderWithCustomWords):
     def __init__(self, wrapped, hijack):
         super().__init__(wrapped, hijack)
+        self.minimal_clip_skip = 2
 
     def encode_with_transformers(self, tokens):
         self.wrapped.transformer.text_model.embeddings.to(tokens.device)
         outputs = self.wrapped.transformer(tokens, output_hidden_states=self.wrapped.layer == "hidden")
 
-        if opts.CLIP_stop_at_last_layers > 1:
-            z = apply_clip_skip_to_transformer_outputs(outputs, last_layer=self.wrapped.layer_idx, skip=opts.CLIP_stop_at_last_layers)
+        if opts.CLIP_stop_at_last_layers > self.minimal_clip_skip:
+            z = outputs.hidden_states[-opts.CLIP_stop_at_last_layers]
         elif self.wrapped.layer == "last":
             z = outputs.last_hidden_state
         else:
@@ -90,13 +92,14 @@ class CLIP_SD_XL_G(FrozenCLIPEmbedderWithCustomWords):
         self.id_start = 49406
         self.id_end = 49407
         self.id_pad = 0
+        self.minimal_clip_skip = 2
 
     def encode_with_transformers(self, tokens):
         self.wrapped.transformer.text_model.embeddings.to(tokens.device)
         outputs = self.wrapped.transformer(tokens, output_hidden_states=self.wrapped.layer == "hidden")
 
-        if opts.CLIP_stop_at_last_layers > 1:
-            z = apply_clip_skip_to_transformer_outputs(outputs, last_layer=self.wrapped.layer_idx, skip=opts.CLIP_stop_at_last_layers)
+        if opts.CLIP_stop_at_last_layers > self.minimal_clip_skip:
+            z = outputs.hidden_states[-opts.CLIP_stop_at_last_layers]
         elif self.wrapped.layer == "last":
             z = outputs.last_hidden_state
         else:
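
Note on the change: the removed helper computed hidden_states[last_layer + 1 - skip], i.e. it applied the user's clip-skip setting on top of each encoder's configured layer_idx, so on SDXL (whose encoders are already configured to read a hidden layer rather than the last one) a CivitAI-style "Clip skip: 2" stepped one layer deeper than the model's default. The new code instead treats CLIP_stop_at_last_layers as an absolute offset from the final layer and ignores values at or below each encoder's minimal_clip_skip (1 for the SD1.x CLIP-L, which defaults to the last layer; 2 for the SD2.x and SDXL encoders, which default to a hidden layer), so pony/SDXL image metadata carrying "Clip skip: 2" now reproduces the default output. A minimal, self-contained sketch of the new selection logic against a plain Hugging Face CLIPTextModel follows; the model name, prompt, and variable names are illustrative, not code from this repo:

import torch
from transformers import CLIPTextModel, CLIPTokenizer

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

batch = tokenizer("a photo of a cat", return_tensors="pt")
with torch.no_grad():
    outputs = model(**batch, output_hidden_states=True)

clip_skip = 2          # corresponds to opts.CLIP_stop_at_last_layers
minimal_clip_skip = 1  # 1 for the SD1.x CLIP-L; 2 for SD2.x / SDXL encoders

if clip_skip > minimal_clip_skip:
    # hidden_states[-1] is the last layer's output, so -clip_skip counts
    # back from the end: clip skip 2 selects the penultimate layer.
    z = outputs.hidden_states[-clip_skip]
    z = model.text_model.final_layer_norm(z)
else:
    # at or below the encoder's minimum, keep the model's default output
    z = outputs.last_hidden_state

With clip_skip = 2 and minimal_clip_skip = 1 this selects hidden_states[-2]; with minimal_clip_skip = 2 (the SDXL case) the same setting falls through to the default branch, which is why metadata written for pony models now round-trips instead of skipping an extra layer.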