Fixed issues with converting and saving models. Cleaned up keys. Improved testing for cycled load/save.

Jaret Burkett
2023-08-29 12:31:19 -06:00
parent 714854ee86
commit 14ff51ceb4
9 changed files with 784 additions and 1568 deletions

View File

@@ -481,22 +481,14 @@ class BaseSDTrainProcess(BaseTrainProcess):
params = self.embedding.get_trainable_params()
else:
params = []
# assume dreambooth/finetune
if self.train_config.train_text_encoder:
if self.sd.is_xl:
for te in text_encoder:
te.requires_grad_(True)
te.train()
params += te.parameters()
else:
text_encoder.requires_grad_(True)
text_encoder.train()
params += text_encoder.parameters()
if self.train_config.train_unet:
unet.requires_grad_(True)
unet.train()
params += unet.parameters()
params = self.sd.prepare_optimizer_params(
vae=False,
unet=self.train_config.train_unet,
text_encoder=self.train_config.train_text_encoder,
text_encoder_lr=self.train_config.lr,
unet_lr=self.train_config.lr,
default_lr=self.train_config.lr
)
### HOOK ###
params = self.hook_add_extra_train_params(params)
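After this hunk, `params` is no longer a flat list of tensors but a list of torch-style parameter groups, which the existing hook can extend by appending further groups. A minimal sketch of that shape (the hook body and tensors here are illustrative, not the project's actual ones):

import torch

# after this change, `params` is a list of optimizer param groups rather than a
# flat list of tensors; each group carries its own learning rate
params = [
    {"params": [torch.nn.Parameter(torch.randn(4, 4))], "lr": 1e-6},  # e.g. the unet group
    {"params": [torch.nn.Parameter(torch.randn(4, 4))], "lr": 1e-6},  # e.g. the text encoder group
]

# a hook like hook_add_extra_train_params can then simply append further groups
def add_extra_train_params(param_groups):
    extra = torch.nn.Parameter(torch.randn(8))  # hypothetical embedding weight
    param_groups.append({"params": [extra], "lr": 1e-6})
    return param_groups

params = add_extra_train_params(params)
print(len(params))  # 3 groups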

View File

@@ -95,6 +95,8 @@ matched_diffusers_keys = []
error_margin = 1e-4
tmp_merge_key = "TMP___MERGE"
te_suffix = ''
proj_pattern_weight = None
proj_pattern_bias = None
@@ -139,7 +141,7 @@ if args.sdxl or args.sd2:
f"te{te_suffix}_text_model.encoder.layers.{number}.self_attn.v_proj.weight")
# make diffusers convertable_dict
diffusers_state_dict[
f"te{te_suffix}_text_model.encoder.layers.{number}.self_attn.MERGED.weight"] = new_val
f"te{te_suffix}_text_model.encoder.layers.{number}.self_attn.{tmp_merge_key}.weight"] = new_val
# add operator
ldm_operator_map[ldm_key] = {
@@ -148,7 +150,6 @@ if args.sdxl or args.sd2:
f"te{te_suffix}_text_model.encoder.layers.{number}.self_attn.k_proj.weight",
f"te{te_suffix}_text_model.encoder.layers.{number}.self_attn.v_proj.weight",
],
"target": f"te{te_suffix}_text_model.encoder.layers.{number}.self_attn.MERGED.weight"
}
# text_model_dict[new_key + ".q_proj.weight"] = checkpoint[key][:d_model, :]
@@ -189,7 +190,7 @@ if args.sdxl or args.sd2:
matched_diffusers_keys.append(f"te{te_suffix}_text_model.encoder.layers.{number}.self_attn.v_proj.bias")
# make diffusers convertable_dict
diffusers_state_dict[
f"te{te_suffix}_text_model.encoder.layers.{number}.self_attn.MERGED.bias"] = new_val
f"te{te_suffix}_text_model.encoder.layers.{number}.self_attn.{tmp_merge_key}.bias"] = new_val
# add operator
ldm_operator_map[ldm_key] = {
@@ -198,7 +199,6 @@ if args.sdxl or args.sd2:
f"te{te_suffix}_text_model.encoder.layers.{number}.self_attn.k_proj.bias",
f"te{te_suffix}_text_model.encoder.layers.{number}.self_attn.v_proj.bias",
],
# "target": f"te{te_suffix}_text_model.encoder.layers.{number}.self_attn.MERGED.bias"
}
# add diffusers operators
@@ -359,13 +359,35 @@ for key in unmatched_ldm_keys:
save_file(remaining_ldm_values, os.path.join(KEYMAPS_FOLDER, f'{name}_ldm_base.safetensors'))
print(f'Saved remaining ldm values to {os.path.join(KEYMAPS_FOLDER, f"{name}_ldm_base.safetensors")}')
# do cleanup of some leftovers and bugs
to_remove = []
for ldm_key, diffusers_key in ldm_diffusers_keymap.items():
# get rid of tmp merge keys used for slicing
if tmp_merge_key in diffusers_key or tmp_merge_key in ldm_key:
to_remove.append(ldm_key)
for key in to_remove:
del ldm_diffusers_keymap[key]
to_remove = []
# remove identical shape mappings. Not sure why they exist, but they do
for ldm_key, shape_list in ldm_diffusers_shape_map.items():
# convert shapes to JSON strings to make them easier to compare
ldm_shape = json.dumps(shape_list[0])
diffusers_shape = json.dumps(shape_list[1])
if ldm_shape == diffusers_shape:
to_remove.append(ldm_key)
for key in to_remove:
del ldm_diffusers_shape_map[key]
dest_path = os.path.join(KEYMAPS_FOLDER, f'{name}.json')
save_obj = OrderedDict()
save_obj["ldm_diffusers_keymap"] = ldm_diffusers_keymap
save_obj["ldm_diffusers_shape_map"] = ldm_diffusers_shape_map
save_obj["ldm_diffusers_operator_map"] = ldm_operator_map
save_obj["diffusers_ldm_operator_map"] = diffusers_operator_map
with open(dest_path, 'w') as f:
f.write(json.dumps(save_obj, indent=4))
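For context, the `cat` operator entries recorded by this script describe how the split q/k/v projection weights are concatenated back into the fused `in_proj` tensor when converting to the LDM layout, while the temporary `TMP___MERGE` placeholder keys only exist while the map is being built and are stripped before saving. A minimal sketch of applying one such operator, using toy tensors in place of the real state dict:

import torch
from collections import OrderedDict

# a single "cat" operator entry, as stored in ldm_diffusers_operator_map
operator = {
    "cat": [
        "te_text_model.encoder.layers.0.self_attn.q_proj.weight",
        "te_text_model.encoder.layers.0.self_attn.k_proj.weight",
        "te_text_model.encoder.layers.0.self_attn.v_proj.weight",
    ]
}

# toy diffusers state dict holding the three projection weights
hidden = 8
diffusers_state_dict = OrderedDict(
    (key, torch.randn(hidden, hidden)) for key in operator["cat"]
)

# applying the operator: concatenate along dim 0 to rebuild the fused
# in_proj weight used by the LDM checkpoint layout
merged = torch.cat([diffusers_state_dict[k] for k in operator["cat"]], dim=0)
print(merged.shape)  # torch.Size([24, 8]) -- 3 * hidden rows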

View File

@@ -67,6 +67,8 @@ class TrainConfig:
self.noise_scheduler = kwargs.get('noise_scheduler', 'ddpm')
self.steps: int = kwargs.get('steps', 1000)
self.lr = kwargs.get('lr', 1e-6)
self.unet_lr = kwargs.get('unet_lr', self.lr)
self.text_encoder_lr = kwargs.get('text_encoder_lr', self.lr)
self.optimizer = kwargs.get('optimizer', 'adamw')
self.lr_scheduler = kwargs.get('lr_scheduler', 'constant')
self.max_denoising_steps: int = kwargs.get('max_denoising_steps', 50)
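The two new fields let the UNet and text encoder learning rates be set independently while defaulting to the shared `lr`. A minimal sketch of that fallback behaviour, using a stand-in class that mirrors the `kwargs.get` pattern above rather than the real `TrainConfig`:

# hypothetical stand-in mirroring the kwargs.get pattern used by TrainConfig
class TrainConfigSketch:
    def __init__(self, **kwargs):
        self.lr = kwargs.get('lr', 1e-6)
        self.unet_lr = kwargs.get('unet_lr', self.lr)
        self.text_encoder_lr = kwargs.get('text_encoder_lr', self.lr)

cfg = TrainConfigSketch(lr=1e-6)
assert cfg.unet_lr == 1e-6 and cfg.text_encoder_lr == 1e-6

cfg = TrainConfigSketch(lr=1e-6, unet_lr=1e-5)
assert cfg.unet_lr == 1e-5 and cfg.text_encoder_lr == 1e-6  # text encoder still follows lr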

View File

@@ -1180,62 +1180,6 @@
512
]
],
"first_stage_model.decoder.up.0.block.0.nin_shortcut.weight": [
[
128,
256,
1,
1
],
[
128,
256,
1,
1
]
],
"first_stage_model.decoder.up.1.block.0.nin_shortcut.weight": [
[
256,
512,
1,
1
],
[
256,
512,
1,
1
]
],
"first_stage_model.encoder.down.1.block.0.nin_shortcut.weight": [
[
256,
128,
1,
1
],
[
256,
128,
1,
1
]
],
"first_stage_model.encoder.down.2.block.0.nin_shortcut.weight": [
[
512,
256,
1,
1
],
[
512,
256,
1,
1
]
],
"first_stage_model.encoder.mid.attn_1.k.weight": [
[
512,
@@ -1283,678 +1227,6 @@
512,
512
]
],
"first_stage_model.post_quant_conv.weight": [
[
4,
4,
1,
1
],
[
4,
4,
1,
1
]
],
"first_stage_model.quant_conv.weight": [
[
8,
8,
1,
1
],
[
8,
8,
1,
1
]
],
"model.diffusion_model.input_blocks.1.1.proj_in.weight": [
[
320,
320,
1,
1
],
[
320,
320,
1,
1
]
],
"model.diffusion_model.input_blocks.1.1.proj_out.weight": [
[
320,
320,
1,
1
],
[
320,
320,
1,
1
]
],
"model.diffusion_model.input_blocks.2.1.proj_in.weight": [
[
320,
320,
1,
1
],
[
320,
320,
1,
1
]
],
"model.diffusion_model.input_blocks.2.1.proj_out.weight": [
[
320,
320,
1,
1
],
[
320,
320,
1,
1
]
],
"model.diffusion_model.input_blocks.4.0.skip_connection.weight": [
[
640,
320,
1,
1
],
[
640,
320,
1,
1
]
],
"model.diffusion_model.input_blocks.4.1.proj_in.weight": [
[
640,
640,
1,
1
],
[
640,
640,
1,
1
]
],
"model.diffusion_model.input_blocks.4.1.proj_out.weight": [
[
640,
640,
1,
1
],
[
640,
640,
1,
1
]
],
"model.diffusion_model.input_blocks.5.1.proj_in.weight": [
[
640,
640,
1,
1
],
[
640,
640,
1,
1
]
],
"model.diffusion_model.input_blocks.5.1.proj_out.weight": [
[
640,
640,
1,
1
],
[
640,
640,
1,
1
]
],
"model.diffusion_model.input_blocks.7.0.skip_connection.weight": [
[
1280,
640,
1,
1
],
[
1280,
640,
1,
1
]
],
"model.diffusion_model.input_blocks.7.1.proj_in.weight": [
[
1280,
1280,
1,
1
],
[
1280,
1280,
1,
1
]
],
"model.diffusion_model.input_blocks.7.1.proj_out.weight": [
[
1280,
1280,
1,
1
],
[
1280,
1280,
1,
1
]
],
"model.diffusion_model.input_blocks.8.1.proj_in.weight": [
[
1280,
1280,
1,
1
],
[
1280,
1280,
1,
1
]
],
"model.diffusion_model.input_blocks.8.1.proj_out.weight": [
[
1280,
1280,
1,
1
],
[
1280,
1280,
1,
1
]
],
"model.diffusion_model.middle_block.1.proj_in.weight": [
[
1280,
1280,
1,
1
],
[
1280,
1280,
1,
1
]
],
"model.diffusion_model.middle_block.1.proj_out.weight": [
[
1280,
1280,
1,
1
],
[
1280,
1280,
1,
1
]
],
"model.diffusion_model.output_blocks.0.0.skip_connection.weight": [
[
1280,
2560,
1,
1
],
[
1280,
2560,
1,
1
]
],
"model.diffusion_model.output_blocks.1.0.skip_connection.weight": [
[
1280,
2560,
1,
1
],
[
1280,
2560,
1,
1
]
],
"model.diffusion_model.output_blocks.10.0.skip_connection.weight": [
[
320,
640,
1,
1
],
[
320,
640,
1,
1
]
],
"model.diffusion_model.output_blocks.10.1.proj_in.weight": [
[
320,
320,
1,
1
],
[
320,
320,
1,
1
]
],
"model.diffusion_model.output_blocks.10.1.proj_out.weight": [
[
320,
320,
1,
1
],
[
320,
320,
1,
1
]
],
"model.diffusion_model.output_blocks.11.0.skip_connection.weight": [
[
320,
640,
1,
1
],
[
320,
640,
1,
1
]
],
"model.diffusion_model.output_blocks.11.1.proj_in.weight": [
[
320,
320,
1,
1
],
[
320,
320,
1,
1
]
],
"model.diffusion_model.output_blocks.11.1.proj_out.weight": [
[
320,
320,
1,
1
],
[
320,
320,
1,
1
]
],
"model.diffusion_model.output_blocks.2.0.skip_connection.weight": [
[
1280,
2560,
1,
1
],
[
1280,
2560,
1,
1
]
],
"model.diffusion_model.output_blocks.3.0.skip_connection.weight": [
[
1280,
2560,
1,
1
],
[
1280,
2560,
1,
1
]
],
"model.diffusion_model.output_blocks.3.1.proj_in.weight": [
[
1280,
1280,
1,
1
],
[
1280,
1280,
1,
1
]
],
"model.diffusion_model.output_blocks.3.1.proj_out.weight": [
[
1280,
1280,
1,
1
],
[
1280,
1280,
1,
1
]
],
"model.diffusion_model.output_blocks.4.0.skip_connection.weight": [
[
1280,
2560,
1,
1
],
[
1280,
2560,
1,
1
]
],
"model.diffusion_model.output_blocks.4.1.proj_in.weight": [
[
1280,
1280,
1,
1
],
[
1280,
1280,
1,
1
]
],
"model.diffusion_model.output_blocks.4.1.proj_out.weight": [
[
1280,
1280,
1,
1
],
[
1280,
1280,
1,
1
]
],
"model.diffusion_model.output_blocks.5.0.skip_connection.weight": [
[
1280,
1920,
1,
1
],
[
1280,
1920,
1,
1
]
],
"model.diffusion_model.output_blocks.5.1.proj_in.weight": [
[
1280,
1280,
1,
1
],
[
1280,
1280,
1,
1
]
],
"model.diffusion_model.output_blocks.5.1.proj_out.weight": [
[
1280,
1280,
1,
1
],
[
1280,
1280,
1,
1
]
],
"model.diffusion_model.output_blocks.6.0.skip_connection.weight": [
[
640,
1920,
1,
1
],
[
640,
1920,
1,
1
]
],
"model.diffusion_model.output_blocks.6.1.proj_in.weight": [
[
640,
640,
1,
1
],
[
640,
640,
1,
1
]
],
"model.diffusion_model.output_blocks.6.1.proj_out.weight": [
[
640,
640,
1,
1
],
[
640,
640,
1,
1
]
],
"model.diffusion_model.output_blocks.7.0.skip_connection.weight": [
[
640,
1280,
1,
1
],
[
640,
1280,
1,
1
]
],
"model.diffusion_model.output_blocks.7.1.proj_in.weight": [
[
640,
640,
1,
1
],
[
640,
640,
1,
1
]
],
"model.diffusion_model.output_blocks.7.1.proj_out.weight": [
[
640,
640,
1,
1
],
[
640,
640,
1,
1
]
],
"model.diffusion_model.output_blocks.8.0.skip_connection.weight": [
[
640,
960,
1,
1
],
[
640,
960,
1,
1
]
],
"model.diffusion_model.output_blocks.8.1.proj_in.weight": [
[
640,
640,
1,
1
],
[
640,
640,
1,
1
]
],
"model.diffusion_model.output_blocks.8.1.proj_out.weight": [
[
640,
640,
1,
1
],
[
640,
640,
1,
1
]
],
"model.diffusion_model.output_blocks.9.0.skip_connection.weight": [
[
320,
960,
1,
1
],
[
320,
960,
1,
1
]
],
"model.diffusion_model.output_blocks.9.1.proj_in.weight": [
[
320,
320,
1,
1
],
[
320,
320,
1,
1
]
],
"model.diffusion_model.output_blocks.9.1.proj_out.weight": [
[
320,
320,
1,
1
],
[
320,
320,
1,
1
]
]
},
"ldm_diffusers_operator_map": {},

View File

@@ -4,8 +4,6 @@
"cond_stage_model.model.ln_final.weight": "te_text_model.final_layer_norm.weight",
"cond_stage_model.model.positional_embedding": "te_text_model.embeddings.position_embedding.weight",
"cond_stage_model.model.token_embedding.weight": "te_text_model.embeddings.token_embedding.weight",
"cond_stage_model.model.transformer.resblocks.0.attn.in_proj_bias": "te_text_model.encoder.layers.0.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.0.attn.in_proj_weight": "te_text_model.encoder.layers.0.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.0.attn.out_proj.bias": "te_text_model.encoder.layers.0.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.0.attn.out_proj.weight": "te_text_model.encoder.layers.0.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.0.ln_1.bias": "te_text_model.encoder.layers.0.layer_norm1.bias",
@@ -16,8 +14,6 @@
"cond_stage_model.model.transformer.resblocks.0.mlp.c_fc.weight": "te_text_model.encoder.layers.0.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.0.mlp.c_proj.bias": "te_text_model.encoder.layers.0.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.0.mlp.c_proj.weight": "te_text_model.encoder.layers.0.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.1.attn.in_proj_bias": "te_text_model.encoder.layers.1.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.1.attn.in_proj_weight": "te_text_model.encoder.layers.1.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.1.attn.out_proj.bias": "te_text_model.encoder.layers.1.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.1.attn.out_proj.weight": "te_text_model.encoder.layers.1.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.1.ln_1.bias": "te_text_model.encoder.layers.1.layer_norm1.bias",
@@ -28,8 +24,6 @@
"cond_stage_model.model.transformer.resblocks.1.mlp.c_fc.weight": "te_text_model.encoder.layers.1.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.1.mlp.c_proj.bias": "te_text_model.encoder.layers.1.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.1.mlp.c_proj.weight": "te_text_model.encoder.layers.1.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.10.attn.in_proj_bias": "te_text_model.encoder.layers.10.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.10.attn.in_proj_weight": "te_text_model.encoder.layers.10.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.10.attn.out_proj.bias": "te_text_model.encoder.layers.10.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.10.attn.out_proj.weight": "te_text_model.encoder.layers.10.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.10.ln_1.bias": "te_text_model.encoder.layers.10.layer_norm1.bias",
@@ -40,8 +34,6 @@
"cond_stage_model.model.transformer.resblocks.10.mlp.c_fc.weight": "te_text_model.encoder.layers.10.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.10.mlp.c_proj.bias": "te_text_model.encoder.layers.10.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.10.mlp.c_proj.weight": "te_text_model.encoder.layers.10.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.11.attn.in_proj_bias": "te_text_model.encoder.layers.11.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.11.attn.in_proj_weight": "te_text_model.encoder.layers.11.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.11.attn.out_proj.bias": "te_text_model.encoder.layers.11.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.11.attn.out_proj.weight": "te_text_model.encoder.layers.11.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.11.ln_1.bias": "te_text_model.encoder.layers.11.layer_norm1.bias",
@@ -52,8 +44,6 @@
"cond_stage_model.model.transformer.resblocks.11.mlp.c_fc.weight": "te_text_model.encoder.layers.11.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.11.mlp.c_proj.bias": "te_text_model.encoder.layers.11.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.11.mlp.c_proj.weight": "te_text_model.encoder.layers.11.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.12.attn.in_proj_bias": "te_text_model.encoder.layers.12.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.12.attn.in_proj_weight": "te_text_model.encoder.layers.12.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.12.attn.out_proj.bias": "te_text_model.encoder.layers.12.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.12.attn.out_proj.weight": "te_text_model.encoder.layers.12.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.12.ln_1.bias": "te_text_model.encoder.layers.12.layer_norm1.bias",
@@ -64,8 +54,6 @@
"cond_stage_model.model.transformer.resblocks.12.mlp.c_fc.weight": "te_text_model.encoder.layers.12.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.12.mlp.c_proj.bias": "te_text_model.encoder.layers.12.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.12.mlp.c_proj.weight": "te_text_model.encoder.layers.12.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.13.attn.in_proj_bias": "te_text_model.encoder.layers.13.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.13.attn.in_proj_weight": "te_text_model.encoder.layers.13.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.13.attn.out_proj.bias": "te_text_model.encoder.layers.13.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.13.attn.out_proj.weight": "te_text_model.encoder.layers.13.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.13.ln_1.bias": "te_text_model.encoder.layers.13.layer_norm1.bias",
@@ -76,8 +64,6 @@
"cond_stage_model.model.transformer.resblocks.13.mlp.c_fc.weight": "te_text_model.encoder.layers.13.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.13.mlp.c_proj.bias": "te_text_model.encoder.layers.13.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.13.mlp.c_proj.weight": "te_text_model.encoder.layers.13.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.14.attn.in_proj_bias": "te_text_model.encoder.layers.14.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.14.attn.in_proj_weight": "te_text_model.encoder.layers.14.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.14.attn.out_proj.bias": "te_text_model.encoder.layers.14.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.14.attn.out_proj.weight": "te_text_model.encoder.layers.14.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.14.ln_1.bias": "te_text_model.encoder.layers.14.layer_norm1.bias",
@@ -88,8 +74,6 @@
"cond_stage_model.model.transformer.resblocks.14.mlp.c_fc.weight": "te_text_model.encoder.layers.14.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.14.mlp.c_proj.bias": "te_text_model.encoder.layers.14.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.14.mlp.c_proj.weight": "te_text_model.encoder.layers.14.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.15.attn.in_proj_bias": "te_text_model.encoder.layers.15.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.15.attn.in_proj_weight": "te_text_model.encoder.layers.15.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.15.attn.out_proj.bias": "te_text_model.encoder.layers.15.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.15.attn.out_proj.weight": "te_text_model.encoder.layers.15.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.15.ln_1.bias": "te_text_model.encoder.layers.15.layer_norm1.bias",
@@ -100,8 +84,6 @@
"cond_stage_model.model.transformer.resblocks.15.mlp.c_fc.weight": "te_text_model.encoder.layers.15.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.15.mlp.c_proj.bias": "te_text_model.encoder.layers.15.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.15.mlp.c_proj.weight": "te_text_model.encoder.layers.15.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.16.attn.in_proj_bias": "te_text_model.encoder.layers.16.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.16.attn.in_proj_weight": "te_text_model.encoder.layers.16.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.16.attn.out_proj.bias": "te_text_model.encoder.layers.16.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.16.attn.out_proj.weight": "te_text_model.encoder.layers.16.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.16.ln_1.bias": "te_text_model.encoder.layers.16.layer_norm1.bias",
@@ -112,8 +94,6 @@
"cond_stage_model.model.transformer.resblocks.16.mlp.c_fc.weight": "te_text_model.encoder.layers.16.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.16.mlp.c_proj.bias": "te_text_model.encoder.layers.16.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.16.mlp.c_proj.weight": "te_text_model.encoder.layers.16.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.17.attn.in_proj_bias": "te_text_model.encoder.layers.17.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.17.attn.in_proj_weight": "te_text_model.encoder.layers.17.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.17.attn.out_proj.bias": "te_text_model.encoder.layers.17.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.17.attn.out_proj.weight": "te_text_model.encoder.layers.17.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.17.ln_1.bias": "te_text_model.encoder.layers.17.layer_norm1.bias",
@@ -124,8 +104,6 @@
"cond_stage_model.model.transformer.resblocks.17.mlp.c_fc.weight": "te_text_model.encoder.layers.17.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.17.mlp.c_proj.bias": "te_text_model.encoder.layers.17.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.17.mlp.c_proj.weight": "te_text_model.encoder.layers.17.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.18.attn.in_proj_bias": "te_text_model.encoder.layers.18.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.18.attn.in_proj_weight": "te_text_model.encoder.layers.18.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.18.attn.out_proj.bias": "te_text_model.encoder.layers.18.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.18.attn.out_proj.weight": "te_text_model.encoder.layers.18.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.18.ln_1.bias": "te_text_model.encoder.layers.18.layer_norm1.bias",
@@ -136,8 +114,6 @@
"cond_stage_model.model.transformer.resblocks.18.mlp.c_fc.weight": "te_text_model.encoder.layers.18.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.18.mlp.c_proj.bias": "te_text_model.encoder.layers.18.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.18.mlp.c_proj.weight": "te_text_model.encoder.layers.18.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.19.attn.in_proj_bias": "te_text_model.encoder.layers.19.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.19.attn.in_proj_weight": "te_text_model.encoder.layers.19.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.19.attn.out_proj.bias": "te_text_model.encoder.layers.19.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.19.attn.out_proj.weight": "te_text_model.encoder.layers.19.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.19.ln_1.bias": "te_text_model.encoder.layers.19.layer_norm1.bias",
@@ -148,8 +124,6 @@
"cond_stage_model.model.transformer.resblocks.19.mlp.c_fc.weight": "te_text_model.encoder.layers.19.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.19.mlp.c_proj.bias": "te_text_model.encoder.layers.19.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.19.mlp.c_proj.weight": "te_text_model.encoder.layers.19.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.2.attn.in_proj_bias": "te_text_model.encoder.layers.2.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.2.attn.in_proj_weight": "te_text_model.encoder.layers.2.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.2.attn.out_proj.bias": "te_text_model.encoder.layers.2.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.2.attn.out_proj.weight": "te_text_model.encoder.layers.2.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.2.ln_1.bias": "te_text_model.encoder.layers.2.layer_norm1.bias",
@@ -160,8 +134,6 @@
"cond_stage_model.model.transformer.resblocks.2.mlp.c_fc.weight": "te_text_model.encoder.layers.2.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.2.mlp.c_proj.bias": "te_text_model.encoder.layers.2.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.2.mlp.c_proj.weight": "te_text_model.encoder.layers.2.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.20.attn.in_proj_bias": "te_text_model.encoder.layers.20.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.20.attn.in_proj_weight": "te_text_model.encoder.layers.20.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.20.attn.out_proj.bias": "te_text_model.encoder.layers.20.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.20.attn.out_proj.weight": "te_text_model.encoder.layers.20.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.20.ln_1.bias": "te_text_model.encoder.layers.20.layer_norm1.bias",
@@ -172,8 +144,6 @@
"cond_stage_model.model.transformer.resblocks.20.mlp.c_fc.weight": "te_text_model.encoder.layers.20.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.20.mlp.c_proj.bias": "te_text_model.encoder.layers.20.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.20.mlp.c_proj.weight": "te_text_model.encoder.layers.20.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.21.attn.in_proj_bias": "te_text_model.encoder.layers.21.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.21.attn.in_proj_weight": "te_text_model.encoder.layers.21.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.21.attn.out_proj.bias": "te_text_model.encoder.layers.21.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.21.attn.out_proj.weight": "te_text_model.encoder.layers.21.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.21.ln_1.bias": "te_text_model.encoder.layers.21.layer_norm1.bias",
@@ -184,8 +154,6 @@
"cond_stage_model.model.transformer.resblocks.21.mlp.c_fc.weight": "te_text_model.encoder.layers.21.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.21.mlp.c_proj.bias": "te_text_model.encoder.layers.21.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.21.mlp.c_proj.weight": "te_text_model.encoder.layers.21.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.22.attn.in_proj_bias": "te_text_model.encoder.layers.22.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.22.attn.in_proj_weight": "te_text_model.encoder.layers.22.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.22.attn.out_proj.bias": "te_text_model.encoder.layers.22.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.22.attn.out_proj.weight": "te_text_model.encoder.layers.22.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.22.ln_1.bias": "te_text_model.encoder.layers.22.layer_norm1.bias",
@@ -196,8 +164,6 @@
"cond_stage_model.model.transformer.resblocks.22.mlp.c_fc.weight": "te_text_model.encoder.layers.22.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.22.mlp.c_proj.bias": "te_text_model.encoder.layers.22.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.22.mlp.c_proj.weight": "te_text_model.encoder.layers.22.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.3.attn.in_proj_bias": "te_text_model.encoder.layers.3.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.3.attn.in_proj_weight": "te_text_model.encoder.layers.3.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.3.attn.out_proj.bias": "te_text_model.encoder.layers.3.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.3.attn.out_proj.weight": "te_text_model.encoder.layers.3.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.3.ln_1.bias": "te_text_model.encoder.layers.3.layer_norm1.bias",
@@ -208,8 +174,6 @@
"cond_stage_model.model.transformer.resblocks.3.mlp.c_fc.weight": "te_text_model.encoder.layers.3.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.3.mlp.c_proj.bias": "te_text_model.encoder.layers.3.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.3.mlp.c_proj.weight": "te_text_model.encoder.layers.3.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.4.attn.in_proj_bias": "te_text_model.encoder.layers.4.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.4.attn.in_proj_weight": "te_text_model.encoder.layers.4.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.4.attn.out_proj.bias": "te_text_model.encoder.layers.4.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.4.attn.out_proj.weight": "te_text_model.encoder.layers.4.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.4.ln_1.bias": "te_text_model.encoder.layers.4.layer_norm1.bias",
@@ -220,8 +184,6 @@
"cond_stage_model.model.transformer.resblocks.4.mlp.c_fc.weight": "te_text_model.encoder.layers.4.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.4.mlp.c_proj.bias": "te_text_model.encoder.layers.4.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.4.mlp.c_proj.weight": "te_text_model.encoder.layers.4.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.5.attn.in_proj_bias": "te_text_model.encoder.layers.5.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.5.attn.in_proj_weight": "te_text_model.encoder.layers.5.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.5.attn.out_proj.bias": "te_text_model.encoder.layers.5.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.5.attn.out_proj.weight": "te_text_model.encoder.layers.5.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.5.ln_1.bias": "te_text_model.encoder.layers.5.layer_norm1.bias",
@@ -232,8 +194,6 @@
"cond_stage_model.model.transformer.resblocks.5.mlp.c_fc.weight": "te_text_model.encoder.layers.5.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.5.mlp.c_proj.bias": "te_text_model.encoder.layers.5.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.5.mlp.c_proj.weight": "te_text_model.encoder.layers.5.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.6.attn.in_proj_bias": "te_text_model.encoder.layers.6.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.6.attn.in_proj_weight": "te_text_model.encoder.layers.6.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.6.attn.out_proj.bias": "te_text_model.encoder.layers.6.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.6.attn.out_proj.weight": "te_text_model.encoder.layers.6.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.6.ln_1.bias": "te_text_model.encoder.layers.6.layer_norm1.bias",
@@ -244,8 +204,6 @@
"cond_stage_model.model.transformer.resblocks.6.mlp.c_fc.weight": "te_text_model.encoder.layers.6.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.6.mlp.c_proj.bias": "te_text_model.encoder.layers.6.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.6.mlp.c_proj.weight": "te_text_model.encoder.layers.6.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.7.attn.in_proj_bias": "te_text_model.encoder.layers.7.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.7.attn.in_proj_weight": "te_text_model.encoder.layers.7.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.7.attn.out_proj.bias": "te_text_model.encoder.layers.7.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.7.attn.out_proj.weight": "te_text_model.encoder.layers.7.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.7.ln_1.bias": "te_text_model.encoder.layers.7.layer_norm1.bias",
@@ -256,8 +214,6 @@
"cond_stage_model.model.transformer.resblocks.7.mlp.c_fc.weight": "te_text_model.encoder.layers.7.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.7.mlp.c_proj.bias": "te_text_model.encoder.layers.7.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.7.mlp.c_proj.weight": "te_text_model.encoder.layers.7.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.8.attn.in_proj_bias": "te_text_model.encoder.layers.8.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.8.attn.in_proj_weight": "te_text_model.encoder.layers.8.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.8.attn.out_proj.bias": "te_text_model.encoder.layers.8.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.8.attn.out_proj.weight": "te_text_model.encoder.layers.8.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.8.ln_1.bias": "te_text_model.encoder.layers.8.layer_norm1.bias",
@@ -268,8 +224,6 @@
"cond_stage_model.model.transformer.resblocks.8.mlp.c_fc.weight": "te_text_model.encoder.layers.8.mlp.fc1.weight",
"cond_stage_model.model.transformer.resblocks.8.mlp.c_proj.bias": "te_text_model.encoder.layers.8.mlp.fc2.bias",
"cond_stage_model.model.transformer.resblocks.8.mlp.c_proj.weight": "te_text_model.encoder.layers.8.mlp.fc2.weight",
"cond_stage_model.model.transformer.resblocks.9.attn.in_proj_bias": "te_text_model.encoder.layers.9.self_attn.MERGED.bias",
"cond_stage_model.model.transformer.resblocks.9.attn.in_proj_weight": "te_text_model.encoder.layers.9.self_attn.MERGED.weight",
"cond_stage_model.model.transformer.resblocks.9.attn.out_proj.bias": "te_text_model.encoder.layers.9.self_attn.out_proj.bias",
"cond_stage_model.model.transformer.resblocks.9.attn.out_proj.weight": "te_text_model.encoder.layers.9.self_attn.out_proj.weight",
"cond_stage_model.model.transformer.resblocks.9.ln_1.bias": "te_text_model.encoder.layers.9.layer_norm1.bias",
@@ -1264,62 +1218,6 @@
512
]
],
"first_stage_model.decoder.up.0.block.0.nin_shortcut.weight": [
[
128,
256,
1,
1
],
[
128,
256,
1,
1
]
],
"first_stage_model.decoder.up.1.block.0.nin_shortcut.weight": [
[
256,
512,
1,
1
],
[
256,
512,
1,
1
]
],
"first_stage_model.encoder.down.1.block.0.nin_shortcut.weight": [
[
256,
128,
1,
1
],
[
256,
128,
1,
1
]
],
"first_stage_model.encoder.down.2.block.0.nin_shortcut.weight": [
[
512,
256,
1,
1
],
[
512,
256,
1,
1
]
],
"first_stage_model.encoder.mid.attn_1.k.weight": [
[
512,
@@ -1367,230 +1265,6 @@
512,
512
]
],
"first_stage_model.post_quant_conv.weight": [
[
4,
4,
1,
1
],
[
4,
4,
1,
1
]
],
"first_stage_model.quant_conv.weight": [
[
8,
8,
1,
1
],
[
8,
8,
1,
1
]
],
"model.diffusion_model.input_blocks.4.0.skip_connection.weight": [
[
640,
320,
1,
1
],
[
640,
320,
1,
1
]
],
"model.diffusion_model.input_blocks.7.0.skip_connection.weight": [
[
1280,
640,
1,
1
],
[
1280,
640,
1,
1
]
],
"model.diffusion_model.output_blocks.0.0.skip_connection.weight": [
[
1280,
2560,
1,
1
],
[
1280,
2560,
1,
1
]
],
"model.diffusion_model.output_blocks.1.0.skip_connection.weight": [
[
1280,
2560,
1,
1
],
[
1280,
2560,
1,
1
]
],
"model.diffusion_model.output_blocks.10.0.skip_connection.weight": [
[
320,
640,
1,
1
],
[
320,
640,
1,
1
]
],
"model.diffusion_model.output_blocks.11.0.skip_connection.weight": [
[
320,
640,
1,
1
],
[
320,
640,
1,
1
]
],
"model.diffusion_model.output_blocks.2.0.skip_connection.weight": [
[
1280,
2560,
1,
1
],
[
1280,
2560,
1,
1
]
],
"model.diffusion_model.output_blocks.3.0.skip_connection.weight": [
[
1280,
2560,
1,
1
],
[
1280,
2560,
1,
1
]
],
"model.diffusion_model.output_blocks.4.0.skip_connection.weight": [
[
1280,
2560,
1,
1
],
[
1280,
2560,
1,
1
]
],
"model.diffusion_model.output_blocks.5.0.skip_connection.weight": [
[
1280,
1920,
1,
1
],
[
1280,
1920,
1,
1
]
],
"model.diffusion_model.output_blocks.6.0.skip_connection.weight": [
[
640,
1920,
1,
1
],
[
640,
1920,
1,
1
]
],
"model.diffusion_model.output_blocks.7.0.skip_connection.weight": [
[
640,
1280,
1,
1
],
[
640,
1280,
1,
1
]
],
"model.diffusion_model.output_blocks.8.0.skip_connection.weight": [
[
640,
960,
1,
1
],
[
640,
960,
1,
1
]
],
"model.diffusion_model.output_blocks.9.0.skip_connection.weight": [
[
320,
960,
1,
1
],
[
320,
960,
1,
1
]
]
},
"ldm_diffusers_operator_map": {
@@ -1606,8 +1280,7 @@
"te_text_model.encoder.layers.0.self_attn.q_proj.weight",
"te_text_model.encoder.layers.0.self_attn.k_proj.weight",
"te_text_model.encoder.layers.0.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.0.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.1.attn.in_proj_bias": {
"cat": [
@@ -1621,8 +1294,7 @@
"te_text_model.encoder.layers.1.self_attn.q_proj.weight",
"te_text_model.encoder.layers.1.self_attn.k_proj.weight",
"te_text_model.encoder.layers.1.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.1.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.10.attn.in_proj_bias": {
"cat": [
@@ -1636,8 +1308,7 @@
"te_text_model.encoder.layers.10.self_attn.q_proj.weight",
"te_text_model.encoder.layers.10.self_attn.k_proj.weight",
"te_text_model.encoder.layers.10.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.10.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.11.attn.in_proj_bias": {
"cat": [
@@ -1651,8 +1322,7 @@
"te_text_model.encoder.layers.11.self_attn.q_proj.weight",
"te_text_model.encoder.layers.11.self_attn.k_proj.weight",
"te_text_model.encoder.layers.11.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.11.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.12.attn.in_proj_bias": {
"cat": [
@@ -1666,8 +1336,7 @@
"te_text_model.encoder.layers.12.self_attn.q_proj.weight",
"te_text_model.encoder.layers.12.self_attn.k_proj.weight",
"te_text_model.encoder.layers.12.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.12.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.13.attn.in_proj_bias": {
"cat": [
@@ -1681,8 +1350,7 @@
"te_text_model.encoder.layers.13.self_attn.q_proj.weight",
"te_text_model.encoder.layers.13.self_attn.k_proj.weight",
"te_text_model.encoder.layers.13.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.13.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.14.attn.in_proj_bias": {
"cat": [
@@ -1696,8 +1364,7 @@
"te_text_model.encoder.layers.14.self_attn.q_proj.weight",
"te_text_model.encoder.layers.14.self_attn.k_proj.weight",
"te_text_model.encoder.layers.14.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.14.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.15.attn.in_proj_bias": {
"cat": [
@@ -1711,8 +1378,7 @@
"te_text_model.encoder.layers.15.self_attn.q_proj.weight",
"te_text_model.encoder.layers.15.self_attn.k_proj.weight",
"te_text_model.encoder.layers.15.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.15.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.16.attn.in_proj_bias": {
"cat": [
@@ -1726,8 +1392,7 @@
"te_text_model.encoder.layers.16.self_attn.q_proj.weight",
"te_text_model.encoder.layers.16.self_attn.k_proj.weight",
"te_text_model.encoder.layers.16.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.16.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.17.attn.in_proj_bias": {
"cat": [
@@ -1741,8 +1406,7 @@
"te_text_model.encoder.layers.17.self_attn.q_proj.weight",
"te_text_model.encoder.layers.17.self_attn.k_proj.weight",
"te_text_model.encoder.layers.17.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.17.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.18.attn.in_proj_bias": {
"cat": [
@@ -1756,8 +1420,7 @@
"te_text_model.encoder.layers.18.self_attn.q_proj.weight",
"te_text_model.encoder.layers.18.self_attn.k_proj.weight",
"te_text_model.encoder.layers.18.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.18.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.19.attn.in_proj_bias": {
"cat": [
@@ -1771,8 +1434,7 @@
"te_text_model.encoder.layers.19.self_attn.q_proj.weight",
"te_text_model.encoder.layers.19.self_attn.k_proj.weight",
"te_text_model.encoder.layers.19.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.19.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.2.attn.in_proj_bias": {
"cat": [
@@ -1786,8 +1448,7 @@
"te_text_model.encoder.layers.2.self_attn.q_proj.weight",
"te_text_model.encoder.layers.2.self_attn.k_proj.weight",
"te_text_model.encoder.layers.2.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.2.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.20.attn.in_proj_bias": {
"cat": [
@@ -1801,8 +1462,7 @@
"te_text_model.encoder.layers.20.self_attn.q_proj.weight",
"te_text_model.encoder.layers.20.self_attn.k_proj.weight",
"te_text_model.encoder.layers.20.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.20.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.21.attn.in_proj_bias": {
"cat": [
@@ -1816,8 +1476,7 @@
"te_text_model.encoder.layers.21.self_attn.q_proj.weight",
"te_text_model.encoder.layers.21.self_attn.k_proj.weight",
"te_text_model.encoder.layers.21.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.21.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.22.attn.in_proj_bias": {
"cat": [
@@ -1831,8 +1490,7 @@
"te_text_model.encoder.layers.22.self_attn.q_proj.weight",
"te_text_model.encoder.layers.22.self_attn.k_proj.weight",
"te_text_model.encoder.layers.22.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.22.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.3.attn.in_proj_bias": {
"cat": [
@@ -1846,8 +1504,7 @@
"te_text_model.encoder.layers.3.self_attn.q_proj.weight",
"te_text_model.encoder.layers.3.self_attn.k_proj.weight",
"te_text_model.encoder.layers.3.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.3.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.4.attn.in_proj_bias": {
"cat": [
@@ -1861,8 +1518,7 @@
"te_text_model.encoder.layers.4.self_attn.q_proj.weight",
"te_text_model.encoder.layers.4.self_attn.k_proj.weight",
"te_text_model.encoder.layers.4.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.4.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.5.attn.in_proj_bias": {
"cat": [
@@ -1876,8 +1532,7 @@
"te_text_model.encoder.layers.5.self_attn.q_proj.weight",
"te_text_model.encoder.layers.5.self_attn.k_proj.weight",
"te_text_model.encoder.layers.5.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.5.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.6.attn.in_proj_bias": {
"cat": [
@@ -1891,8 +1546,7 @@
"te_text_model.encoder.layers.6.self_attn.q_proj.weight",
"te_text_model.encoder.layers.6.self_attn.k_proj.weight",
"te_text_model.encoder.layers.6.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.6.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.7.attn.in_proj_bias": {
"cat": [
@@ -1906,8 +1560,7 @@
"te_text_model.encoder.layers.7.self_attn.q_proj.weight",
"te_text_model.encoder.layers.7.self_attn.k_proj.weight",
"te_text_model.encoder.layers.7.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.7.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.8.attn.in_proj_bias": {
"cat": [
@@ -1921,8 +1574,7 @@
"te_text_model.encoder.layers.8.self_attn.q_proj.weight",
"te_text_model.encoder.layers.8.self_attn.k_proj.weight",
"te_text_model.encoder.layers.8.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.8.self_attn.MERGED.weight"
]
},
"cond_stage_model.model.transformer.resblocks.9.attn.in_proj_bias": {
"cat": [
@@ -1936,8 +1588,7 @@
"te_text_model.encoder.layers.9.self_attn.q_proj.weight",
"te_text_model.encoder.layers.9.self_attn.k_proj.weight",
"te_text_model.encoder.layers.9.self_attn.v_proj.weight"
],
"target": "te_text_model.encoder.layers.9.self_attn.MERGED.weight"
]
}
},
"diffusers_ldm_operator_map": {

File diff suppressed because it is too large

View File

@@ -6,16 +6,12 @@
77
],
"min": 0.0,
"max": 76.0,
"mean": 38.0,
"std": 22.375
"max": 76.0
},
"conditioner.embedders.1.model.logit_scale": {
"shape": [],
"min": 4.60546875,
"max": 4.60546875,
"mean": 4.60546875,
"std": NaN
"max": 4.60546875
},
"conditioner.embedders.1.model.text_projection": {
"shape": [
@@ -23,9 +19,7 @@
1280
],
"min": -0.15966796875,
"max": 0.230712890625,
"mean": 0.0,
"std": 0.0181732177734375
"max": 0.230712890625
}
},
"diffusers": {
@@ -35,9 +29,7 @@
1280
],
"min": -0.15966796875,
"max": 0.230712890625,
"mean": 2.128152846125886e-05,
"std": 0.018169498071074486
"max": 0.230712890625
}
}
}
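The reference stats above now record only `shape`, `min`, and `max` per tensor; the `mean`/`std` fields were removed (note `std` was NaN for the scalar `logit_scale`). A minimal sketch of producing an entry in this reduced form, assuming the file is generated from a state dict of tensors (the generator shown here is illustrative, not the project's actual script):

import json
import torch

def tensor_stats(tensor: torch.Tensor) -> dict:
    # only the fields kept in the reduced reference file: shape, min, max
    return {
        "shape": list(tensor.shape),
        "min": tensor.min().item(),
        "max": tensor.max().item(),
    }

state_dict = {
    "conditioner.embedders.1.model.logit_scale": torch.tensor(4.60546875),
    "conditioner.embedders.1.model.text_projection": torch.randn(1280, 1280),
}
print(json.dumps({k: tensor_stats(v) for k, v in state_dict.items()}, indent=4))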

View File

@@ -32,6 +32,10 @@ def convert_state_dict_to_ldm_with_mapping(
with open(mapping_path, 'r') as f:
mapping = json.load(f, object_pairs_hook=OrderedDict)
# keep track of keys not matched
ldm_matched_keys = []
diffusers_matched_keys = []
ldm_diffusers_keymap = mapping['ldm_diffusers_keymap']
ldm_diffusers_shape_map = mapping['ldm_diffusers_shape_map']
ldm_diffusers_operator_map = mapping['ldm_diffusers_operator_map']
@@ -52,11 +56,15 @@ def convert_state_dict_to_ldm_with_mapping(
for diffusers_key in ldm_diffusers_operator_map[ldm_key]['cat']:
cat_list.append(diffusers_state_dict[diffusers_key].detach())
converted_state_dict[ldm_key] = torch.cat(cat_list, dim=0).to(device, dtype=dtype)
diffusers_matched_keys.extend(ldm_diffusers_operator_map[ldm_key]['cat'])
ldm_matched_keys.append(ldm_key)
if 'slice' in ldm_diffusers_operator_map[ldm_key]:
tensor_to_slice = diffusers_state_dict[ldm_diffusers_operator_map[ldm_key]['slice'][0]]
slice_text = diffusers_state_dict[ldm_diffusers_operator_map[ldm_key]['slice'][1]]
converted_state_dict[ldm_key] = tensor_to_slice[get_slices_from_string(slice_text)].detach().to(device,
dtype=dtype)
diffusers_matched_keys.extend(ldm_diffusers_operator_map[ldm_key]['slice'])
ldm_matched_keys.append(ldm_key)
# process the rest of the keys
for ldm_key in ldm_diffusers_keymap:
@@ -67,6 +75,22 @@ def convert_state_dict_to_ldm_with_mapping(
if ldm_key in ldm_diffusers_shape_map:
tensor = tensor.view(ldm_diffusers_shape_map[ldm_key][0])
converted_state_dict[ldm_key] = tensor
diffusers_matched_keys.append(ldm_diffusers_keymap[ldm_key])
ldm_matched_keys.append(ldm_key)
# see if any are missing from the known mapping
mapped_diffusers_keys = list(ldm_diffusers_keymap.values())
mapped_ldm_keys = list(ldm_diffusers_keymap.keys())
missing_diffusers_keys = [x for x in mapped_diffusers_keys if x not in diffusers_matched_keys]
missing_ldm_keys = [x for x in mapped_ldm_keys if x not in ldm_matched_keys]
if len(missing_diffusers_keys) > 0:
print(f"WARNING!!!! Missing {len(missing_diffusers_keys)} diffusers keys")
print(missing_diffusers_keys)
if len(missing_ldm_keys) > 0:
print(f"WARNING!!!! Missing {len(missing_ldm_keys)} ldm keys")
print(missing_ldm_keys)
return converted_state_dict
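The new bookkeeping warns when a mapped key was never consumed during conversion, which is what the improved cycle save/load checks rely on. The same check in isolation, with toy keys standing in for the real mapping and state dicts:

# toy keymap plus the keys actually matched during one conversion pass
ldm_diffusers_keymap = {
    "model.diffusion_model.out.0.weight": "unet_conv_norm_out.weight",  # toy entries
    "model.diffusion_model.out.2.weight": "unet_conv_out.weight",
}
ldm_matched_keys = ["model.diffusion_model.out.0.weight"]
diffusers_matched_keys = ["unet_conv_norm_out.weight"]

# anything that is mapped but was never matched points at a conversion gap
missing_diffusers_keys = [v for v in ldm_diffusers_keymap.values()
                          if v not in diffusers_matched_keys]
missing_ldm_keys = [k for k in ldm_diffusers_keymap if k not in ldm_matched_keys]

if len(missing_diffusers_keys) > 0:
    print(f"WARNING!!!! Missing {len(missing_diffusers_keys)} diffusers keys")
if len(missing_ldm_keys) > 0:
    print(f"WARNING!!!! Missing {len(missing_ldm_keys)} ldm keys")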

View File

@@ -1,4 +1,5 @@
import gc
import json
import typing
from typing import Union, List, Tuple
import sys
@@ -15,7 +16,7 @@ from library.model_util import convert_unet_state_dict_to_sd, convert_text_encod
from toolkit import train_tools
from toolkit.config_modules import ModelConfig, GenerateImageConfig
from toolkit.metadata import get_meta_for_safetensors
from toolkit.paths import REPOS_ROOT
from toolkit.paths import REPOS_ROOT, KEYMAPS_ROOT
from toolkit.prompt_utils import inject_trigger_into_prompt, PromptEmbeds
from toolkit.saving import save_ldm_model_from_diffusers
from toolkit.train_tools import get_torch_dtype, apply_noise_offset
@@ -37,6 +38,14 @@ SD_PREFIX_TEXT_ENCODER = "te"
SD_PREFIX_TEXT_ENCODER1 = "te1"
SD_PREFIX_TEXT_ENCODER2 = "te2"
# prefixed diffusers keys
DO_NOT_TRAIN_WEIGHTS = [
"unet_time_embedding.linear_1.bias",
"unet_time_embedding.linear_1.weight",
"unet_time_embedding.linear_2.bias",
"unet_time_embedding.linear_2.weight",
]
class BlankNetwork:
@@ -63,10 +72,6 @@ def flush():
UNET_IN_CHANNELS = 4 # Stable Diffusion's in_channels is fixed at 4. The same applies to XL.
VAE_SCALE_FACTOR = 8 # 2 ** (len(vae.config.block_out_channels) - 1) = 8
# if type checking
if typing.TYPE_CHECKING:
from diffusers import \
@@ -734,3 +739,49 @@ class StableDiffusion:
save_dtype=save_dtype,
sd_version=version_string,
)
def prepare_optimizer_params(
self,
unet=False,
text_encoder=False,
text_encoder_lr=None,
unet_lr=None,
default_lr=1e-6,
):
# todo maybe only get locon ones?
# not all items are saved; to make it match, we need to match our save mappings
# and not train anything that is not mapped. Also add the learning rate
version = 'sd1'
if self.is_xl:
version = 'sdxl'
if self.is_v2:
version = 'sd2'
mapping_filename = f"stable_diffusion_{version}.json"
mapping_path = os.path.join(KEYMAPS_ROOT, mapping_filename)
with open(mapping_path, 'r') as f:
mapping = json.load(f)
ldm_diffusers_keymap = mapping['ldm_diffusers_keymap']
trainable_parameters = []
if unet:
state_dict = self.state_dict(vae=False, unet=unet, text_encoder=False)
unet_lr = unet_lr if unet_lr is not None else default_lr
params = []
for key, diffusers_key in ldm_diffusers_keymap.items():
if diffusers_key in state_dict and diffusers_key not in DO_NOT_TRAIN_WEIGHTS:
params.append(state_dict[diffusers_key])
param_data = {"params": params, "lr": unet_lr}
trainable_parameters.append(param_data)
if text_encoder:
state_dict = self.state_dict(vae=False, unet=unet, text_encoder=text_encoder)
text_encoder_lr = text_encoder_lr if text_encoder_lr is not None else default_lr
params = []
for key, diffusers_key in ldm_diffusers_keymap.items():
if diffusers_key in state_dict and diffusers_key not in DO_NOT_TRAIN_WEIGHTS:
params.append(state_dict[diffusers_key])
param_data = {"params": params, "lr": text_encoder_lr}
trainable_parameters.append(param_data)
return trainable_parameters
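Downstream, the returned param groups can be handed straight to a torch optimizer, with each group keeping its own learning rate. A minimal sketch, using a hypothetical stand-in for the model object rather than the real StableDiffusion class:

import torch

# hypothetical stand-in for a model exposing prepare_optimizer_params;
# the real method filters the diffusers state dict through the LDM keymap
class FakeSD:
    def __init__(self):
        self.unet_w = torch.nn.Parameter(torch.randn(4, 4))
        self.te_w = torch.nn.Parameter(torch.randn(4, 4))

    def prepare_optimizer_params(self, unet=False, text_encoder=False,
                                 text_encoder_lr=None, unet_lr=None, default_lr=1e-6):
        groups = []
        if unet:
            groups.append({"params": [self.unet_w],
                           "lr": unet_lr if unet_lr is not None else default_lr})
        if text_encoder:
            groups.append({"params": [self.te_w],
                           "lr": text_encoder_lr if text_encoder_lr is not None else default_lr})
        return groups

sd = FakeSD()
groups = sd.prepare_optimizer_params(unet=True, text_encoder=True,
                                     unet_lr=1e-5, text_encoder_lr=1e-6)
# torch.optim accepts param groups directly; each group keeps its own learning rate
optimizer = torch.optim.AdamW(groups)
print([g["lr"] for g in optimizer.param_groups])  # [1e-05, 1e-06]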