diff --git a/backend/attention.py b/backend/attention.py
index b72dc07e..6cf13ff2 100644
--- a/backend/attention.py
+++ b/backend/attention.py
@@ -22,7 +22,7 @@ if memory_management.xformers_enabled():
 FORCE_UPCAST_ATTENTION_DTYPE = memory_management.force_upcast_attention_dtype()
 
 
-def get_attn_precision(attn_precision):
+def get_attn_precision(attn_precision=torch.float32):
     if args.disable_attention_upcast:
         return None
     if FORCE_UPCAST_ATTENTION_DTYPE is not None:
diff --git a/backend/modules/k_model.py b/backend/modules/k_model.py
index e795b94d..03e1b644 100644
--- a/backend/modules/k_model.py
+++ b/backend/modules/k_model.py
@@ -1,6 +1,6 @@
 import torch
 
-from backend import memory_management
+from backend import memory_management, attention
 from backend.modules.k_prediction import k_prediction_from_diffusers_scheduler
 
 
@@ -41,14 +41,11 @@ class KModel(torch.nn.Module):
         area = input_shape[0] * input_shape[2] * input_shape[3]
         dtype_size = memory_management.dtype_size(self.computation_dtype)
 
-        scaler = 1.28
-
-        # TODO: Consider these again
-        # if ldm_patched.modules.model_management.xformers_enabled() or ldm_patched.modules.model_management.pytorch_attention_flash_attention():
-        #     scaler = 1.28
-        # else:
-        #     scaler = 1.65
-        # if ldm_patched.ldm.modules.attention._ATTN_PRECISION == "fp32":
-        #     dtype_size = 4
+        if attention.attention_function in [attention.attention_pytorch, attention.attention_xformers]:
+            scaler = 1.28
+        else:
+            scaler = 1.65
+        if attention.get_attn_precision() == torch.float32:
+            dtype_size = 4
 
         return scaler * area * dtype_size * 16384
diff --git a/backend/nn/unet.py b/backend/nn/unet.py
index 7002ec76..a8c721d6 100644
--- a/backend/nn/unet.py
+++ b/backend/nn/unet.py
@@ -174,9 +174,7 @@ class CrossAttention(nn.Module):
 
 class BasicTransformerBlock(nn.Module):
     def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True, ff_in=False,
-                 inner_dim=None,
-                 disable_self_attn=False, disable_temporal_crossattention=False, switch_temporal_ca_to_sa=False,
-                 dtype=None, device=None):
+                 inner_dim=None, disable_self_attn=False, dtype=None, device=None):
         super().__init__()
 
         self.ff_in = ff_in or inner_dim is not None
@@ -193,23 +191,13 @@ class BasicTransformerBlock(nn.Module):
 
         self.attn1 = CrossAttention(query_dim=inner_dim, heads=n_heads, dim_head=d_head, dropout=dropout,
                                     context_dim=context_dim if self.disable_self_attn else None, dtype=dtype, device=device)
-        self.ff = FeedForward(inner_dim, dim_out=dim, dropout=dropout, glu=gated_ff, dtype=dtype, device=device)
-
-        if disable_temporal_crossattention:
-            if switch_temporal_ca_to_sa:
-                raise ValueError
-            else:
-                self.attn2 = None
-        else:
-            context_dim_attn2 = None
-            if not switch_temporal_ca_to_sa:
-                context_dim_attn2 = context_dim
-
-            self.attn2 = CrossAttention(query_dim=inner_dim, context_dim=context_dim_attn2,
-                                        heads=n_heads, dim_head=d_head, dropout=dropout, dtype=dtype, device=device)
-            self.norm2 = nn.LayerNorm(inner_dim, dtype=dtype, device=device)
-        self.norm1 = nn.LayerNorm(inner_dim, dtype=dtype, device=device)
+
+        self.attn2 = CrossAttention(query_dim=inner_dim, context_dim=context_dim,
+                                    heads=n_heads, dim_head=d_head, dropout=dropout, dtype=dtype, device=device)
+        self.norm2 = nn.LayerNorm(inner_dim, dtype=dtype, device=device)
+
+        self.ff = FeedForward(inner_dim, dim_out=dim, dropout=dropout, glu=gated_ff, dtype=dtype, device=device)
         self.norm3 = nn.LayerNorm(inner_dim, dtype=dtype, device=device)
         self.checkpoint = checkpoint
         self.n_heads = n_heads
 