mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-01-26 11:09:50 +00:00
Kandinsky5 model support (#10988)
* Add Kandinsky5 model support lite and pro T2V tested to work * Update kandinsky5.py * Fix fp8 * Fix fp8_scaled text encoder * Add transformer_options for attention * Code cleanup, optimizations, use fp32 for all layers originally at fp32 * ImageToVideo -node * Fix I2V, add necessary latent post process nodes * Support text to image model * Support block replace patches (SLG mostly) * Support official LoRAs * Don't scale RoPE for lite model as that just doesn't work... * Update supported_models.py * Rever RoPE scaling to simpler one * Fix typo * Handle latent dim difference for image model in the VAE instead * Add node to use different prompts for clip_l and qwen25_7b * Reduce peak VRAM usage a bit * Further reduce peak VRAM consumption by chunking ffn * Update chunking * Update memory_usage_factor * Code cleanup, don't force the fp32 layers as it has minimal effect * Allow for stronger changes with first frames normalization Default values are too weak for any meaningful changes, these should probably be exposed as advanced node options when that's available. * Add image model's own chat template, remove unused image2video template * Remove hard error in ReplaceVideoLatentFrames -node * Update kandinsky5.py * Update supported_models.py * Fix typos in prompt template They were now fixed in the original repository as well * Update ReplaceVideoLatentFrames Add tooltips Make source optional Better handle negative index * Rename NormalizeVideoLatentFrames -node For bit better clarity what it does * Fix NormalizeVideoLatentStart node out on non-op
This commit is contained in:
@@ -611,6 +611,24 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
dit_config["num_layers"] = count_blocks(state_dict_keys, '{}transformer_blocks.'.format(key_prefix) + '{}.')
|
||||
return dit_config
|
||||
|
||||
if '{}visual_transformer_blocks.0.cross_attention.key_norm.weight'.format(key_prefix) in state_dict_keys: # Kandinsky 5
|
||||
dit_config = {}
|
||||
model_dim = state_dict['{}visual_embeddings.in_layer.bias'.format(key_prefix)].shape[0]
|
||||
dit_config["model_dim"] = model_dim
|
||||
if model_dim in [4096, 2560]: # pro video and lite image
|
||||
dit_config["axes_dims"] = (32, 48, 48)
|
||||
if model_dim == 2560: # lite image
|
||||
dit_config["rope_scale_factor"] = (1.0, 1.0, 1.0)
|
||||
elif model_dim == 1792: # lite video
|
||||
dit_config["axes_dims"] = (16, 24, 24)
|
||||
dit_config["time_dim"] = state_dict['{}time_embeddings.in_layer.bias'.format(key_prefix)].shape[0]
|
||||
dit_config["image_model"] = "kandinsky5"
|
||||
dit_config["ff_dim"] = state_dict['{}visual_transformer_blocks.0.feed_forward.in_layer.weight'.format(key_prefix)].shape[0]
|
||||
dit_config["visual_embed_dim"] = state_dict['{}visual_embeddings.in_layer.weight'.format(key_prefix)].shape[1]
|
||||
dit_config["num_text_blocks"] = count_blocks(state_dict_keys, '{}text_transformer_blocks.'.format(key_prefix) + '{}.')
|
||||
dit_config["num_visual_blocks"] = count_blocks(state_dict_keys, '{}visual_transformer_blocks.'.format(key_prefix) + '{}.')
|
||||
return dit_config
|
||||
|
||||
if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys:
|
||||
return None
|
||||
|
||||
|
||||
Reference in New Issue
Block a user