From e4de8983c9c20b887ea4dcb342b909a9014de1d3 Mon Sep 17 00:00:00 2001 From: Jaret Burkett Date: Wed, 5 Jul 2023 16:44:58 -0600 Subject: [PATCH] Initial commit --- .gitignore | 171 +++ README.md | 52 + config/examples/locon_config.example.json | 46 + requirements.txt | 6 + scripts/extract_locon.py | 151 +++ toolkit/__init__.py | 0 toolkit/config.py | 39 + toolkit/kohya_model_util.py | 1180 +++++++++++++++++++++ toolkit/lycoris_utils.py | 512 +++++++++ toolkit/metadata.py | 24 + toolkit/paths.py | 4 + 11 files changed, 2185 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 config/examples/locon_config.example.json create mode 100644 requirements.txt create mode 100644 scripts/extract_locon.py create mode 100644 toolkit/__init__.py create mode 100644 toolkit/config.py create mode 100644 toolkit/kohya_model_util.py create mode 100644 toolkit/lycoris_utils.py create mode 100644 toolkit/metadata.py create mode 100644 toolkit/paths.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..c894990f --- /dev/null +++ b/.gitignore @@ -0,0 +1,171 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+
+/env.sh
+/models
+/custom/*
+!/custom/.gitkeep
+/.tmp
+/venv.bkp
+/venv.*
+/config/*
+!/config/examples
+!/config/_PUT_YOUR_CONFIGS_HERE).txt
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..f73770b0
--- /dev/null
+++ b/README.md
@@ -0,0 +1,52 @@
+# AI Toolkit by Ostris
+
+WIP for now, but this will be a collection of AI tools, added as I need them.
+
+## Installation
+
+I will try to update this to be more beginner-friendly, but for now I am assuming
+a general understanding of Python, pip, PyTorch, and virtual environments:
+
+Linux:
+
+```bash
+python3 -m venv venv
+source venv/bin/activate
+pip install -r requirements.txt
+```
+
+Windows:
+
+```bash
+python3 -m venv venv
+venv\Scripts\activate
+pip install -r requirements.txt
+```
+
+## Current Tools
+
+### LyCORIS extractor
+
+It is similar to the [LyCORIS](https://github.com/KohakuBlueleaf/LyCORIS) tool, but adds some quality-of-life (QOL) features.
+It all runs off a config file; you can find an example in `config/examples/locon_config.example.json`.
+Just copy that file into the `config` folder and rename it to `whatever_you_want.json`.
+Then you can edit the file to your liking and call it like so:
+
+```bash
+python3 scripts/extract_locon.py "whatever_you_want"
+```
+
+You can also pass a full path to a config file if you want to keep it somewhere else.
+
+```bash
+python3 scripts/extract_locon.py "/home/user/whatever_you_want.json"
+```
+
+The file name is auto-generated and the file is saved to the `output_folder` set in your config. You can put whatever meta you want in the
+`meta` section of the config file, and it will be added to the metadata of the output file. I just have
+some recommended fields in the example file. The script will add some other useful metadata as well.
+
+`process` is an array of different extraction processes to run, so you can test several settings in one pass. You will normally just need one, though.
+
+Will update this later.
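+
+As a rough illustration (based on how `scripts/extract_locon.py` builds the file name in this commit, so the exact
+pattern may change later), the output is named `lyco_<name>_<mode>_<linear param>_<conv param>.safetensors` inside
+your `output_folder`. For the `fixed` process in the example config, that would look something like:
+
+```bash
+/path/to/output/folder/lyco_name_of_your_model_fixed_64_32.safetensors
+```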
+ diff --git a/config/examples/locon_config.example.json b/config/examples/locon_config.example.json new file mode 100644 index 00000000..142919af --- /dev/null +++ b/config/examples/locon_config.example.json @@ -0,0 +1,46 @@ +{ + "config": { + "name": "name_of_your_model", + "base_model": "/path/to/base/model", + "extract_model": "/path/to/model/to/extract", + "output_folder": "/path/to/output/folder", + "is_v2": false, + "device": "cpu", + "use_sparse_bias": false, + "sparsity": 0.98, + "disable_cp": false, + "process": [ + { + "mode": "fixed", + "linear_dim": 64, + "conv_dim": 32 + }, + { + "mode": "ratio", + "linear_ratio": 0.2, + "conv_ratio": 0.2 + }, + { + "mode": "quantile", + "linear_quantile": 0.5, + "conv_quantile": 0.5 + } + ] + }, + "meta": { + "name": "[name]", + "description": "A short description of your model", + "trigger_words": [ + "put", + "trigger", + "words", + "here" + ], + "version": "0.1", + "creator": { + "name": "Your Name", + "email": "your@email.com", + "website": "https://yourwebsite.com" + } + } +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..f5389336 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +torch +safetensors +diffusers +transformers +lycoris_lora +flatten_json \ No newline at end of file diff --git a/scripts/extract_locon.py b/scripts/extract_locon.py new file mode 100644 index 00000000..19b4bcc1 --- /dev/null +++ b/scripts/extract_locon.py @@ -0,0 +1,151 @@ +import json +import os +import sys + +from flatten_json import flatten + +sys.path.insert(0, os.getcwd()) +PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__)) +CONFIG_FOLDER = os.path.join(PROJECT_ROOT, 'config') +sys.path.append(PROJECT_ROOT) + +import argparse + +from toolkit.lycoris_utils import extract_diff +from toolkit.config import get_config +from toolkit.metadata import create_meta +from toolkit.kohya_model_util import load_models_from_stable_diffusion_checkpoint + +import torch +from safetensors.torch import save_file + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "config_file", + help="Name of config file (eg: person_v1 for config/person_v1.json), or full path if it is not in config folder", + type=str + ) + return parser.parse_args() + + +def main(): + args = get_args() + + config_raw = get_config(args.config_file) + config = config_raw['config'] if 'config' in config_raw else None + if not config: + raise ValueError('config file is invalid. 
Missing "config" key')
+
+    meta = config_raw['meta'] if 'meta' in config_raw else {}
+
+    def get_conf(key, default=None):
+        if key in config:
+            return config[key]
+        else:
+            return default
+
+    is_v2 = get_conf('is_v2', False)
+    name = get_conf('name', None)
+    base_model = get_conf('base_model')
+    extract_model = get_conf('extract_model')
+    output_folder = get_conf('output_folder')
+    process_list = get_conf('process')
+    device = get_conf('device', 'cpu')
+    use_sparse_bias = get_conf('use_sparse_bias', False)
+    sparsity = get_conf('sparsity', 0.98)
+    disable_cp = get_conf('disable_cp', False)
+
+    if not name:
+        raise ValueError('name is required')
+    if not base_model:
+        raise ValueError('base_model is required')
+    if not extract_model:
+        raise ValueError('extract_model is required')
+    if not output_folder:
+        raise ValueError('output_folder is required')
+    if not process_list or len(process_list) == 0:
+        raise ValueError('process is required')
+
+    # check processes
+    for process in process_list:
+        if process['mode'] == 'fixed':
+            if not process['linear_dim']:
+                raise ValueError('linear_dim is required in fixed mode')
+            if not process['conv_dim']:
+                raise ValueError('conv_dim is required in fixed mode')
+        elif process['mode'] == 'threshold':
+            if not process['linear_threshold']:
+                raise ValueError('linear_threshold is required in threshold mode')
+            if not process['conv_threshold']:
+                raise ValueError('conv_threshold is required in threshold mode')
+        elif process['mode'] == 'ratio':
+            if not process['linear_ratio']:
+                raise ValueError('linear_ratio is required in ratio mode')
+            if not process['conv_ratio']:
+                raise ValueError('conv_ratio is required in ratio mode')
+        elif process['mode'] == 'quantile':
+            if not process['linear_quantile']:
+                raise ValueError('linear_quantile is required in quantile mode')
+            if not process['conv_quantile']:
+                raise ValueError('conv_quantile is required in quantile mode')
+        else:
+            raise ValueError('mode is invalid')
+
+    print(f"Loading base model: {base_model}")
+    base = load_models_from_stable_diffusion_checkpoint(is_v2, base_model)
+    print(f"Loading extract model: {extract_model}")
+    extract = load_models_from_stable_diffusion_checkpoint(is_v2, extract_model)
+
+    print(f"Running {len(process_list)} process{'' if len(process_list) == 1 else 'es'}")
+
+    for process in process_list:
+        item_meta = json.loads(json.dumps(meta))
+        item_meta['process'] = process
+        if process['mode'] == 'fixed':
+            linear_mode_param = int(process['linear_dim'])
+            conv_mode_param = int(process['conv_dim'])
+        elif process['mode'] == 'threshold':
+            linear_mode_param = float(process['linear_threshold'])
+            conv_mode_param = float(process['conv_threshold'])
+        elif process['mode'] == 'ratio':
+            linear_mode_param = float(process['linear_ratio'])
+            conv_mode_param = float(process['conv_ratio'])
+        elif process['mode'] == 'quantile':
+            linear_mode_param = float(process['linear_quantile'])
+            conv_mode_param = float(process['conv_quantile'])
+        else:
+            raise ValueError(f"Unknown mode: {process['mode']}")
+
+        print(f"Running process: {process['mode']}, lin: {linear_mode_param}, conv: {conv_mode_param}")
+
+        state_dict, extract_diff_meta = extract_diff(
+            base,
+            extract,
+            process['mode'],
+            linear_mode_param,
+            conv_mode_param,
+            device,
+            use_sparse_bias,
+            sparsity,
+            not disable_cp
+        )
+
+        save_meta = create_meta([
+            item_meta, extract_diff_meta
+        ])
+
+        output_file_name = f"lyco_{name}_{process['mode']}_{linear_mode_param}_{conv_mode_param}.safetensors"
+        output_path = 
os.path.join(output_folder, output_file_name) + os.makedirs(output_folder, exist_ok=True) + + # having issues with meta + save_file(state_dict, output_path) + # save_file(state_dict, output_path, {'meta': json.dumps(save_meta, indent=4)}) + + print(f"Saved to {output_path}") + + +if __name__ == '__main__': + main() diff --git a/toolkit/__init__.py b/toolkit/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/toolkit/config.py b/toolkit/config.py new file mode 100644 index 00000000..168a2bc1 --- /dev/null +++ b/toolkit/config.py @@ -0,0 +1,39 @@ +import os +import json +from toolkit.paths import TOOLKIT_ROOT + +possible_extensions = ['.json', '.jsonc'] + + +def get_cwd_abs_path(path): + if not os.path.isabs(path): + path = os.path.join(os.getcwd(), path) + return path + + +def get_config(config_file_path): + # first check if it is in the config folder + config_path = os.path.join(TOOLKIT_ROOT, 'config', config_file_path) + # see if it is in the config folder with any of the possible extensions if it doesnt have one + real_config_path = None + if not os.path.exists(config_path): + for ext in possible_extensions: + if os.path.exists(config_path + ext): + real_config_path = config_path + ext + break + + # if we didn't find it there, check if it is a full path + if not real_config_path: + if os.path.exists(config_file_path): + real_config_path = config_file_path + elif os.path.exists(get_cwd_abs_path(config_file_path)): + real_config_path = get_cwd_abs_path(config_file_path) + + if not real_config_path: + raise ValueError(f"Could not find config file {config_file_path}") + + # load the config + with open(real_config_path, 'r') as f: + config = json.load(f) + + return config diff --git a/toolkit/kohya_model_util.py b/toolkit/kohya_model_util.py new file mode 100644 index 00000000..8aba172f --- /dev/null +++ b/toolkit/kohya_model_util.py @@ -0,0 +1,1180 @@ +# mostly from https://github.com/kohya-ss/sd-scripts/blob/main/library/model_util.py +# I am infinitely grateful to @kohya-ss for their amazing work in this field. +# This version is updated to handle the latest version of the diffusers library. + +# v1: split from train_db_fixed.py. 
+# v2: support safetensors + +import math +import os +import torch +from transformers import CLIPTextModel, CLIPTokenizer, CLIPTextConfig, logging +from diffusers import AutoencoderKL, DDIMScheduler, StableDiffusionPipeline, UNet2DConditionModel +from safetensors.torch import load_file, save_file + +# DiffUsers版StableDiffusionのモデルパラメータ +NUM_TRAIN_TIMESTEPS = 1000 +BETA_START = 0.00085 +BETA_END = 0.0120 + +UNET_PARAMS_MODEL_CHANNELS = 320 +UNET_PARAMS_CHANNEL_MULT = [1, 2, 4, 4] +UNET_PARAMS_ATTENTION_RESOLUTIONS = [4, 2, 1] +UNET_PARAMS_IMAGE_SIZE = 64 # fixed from old invalid value `32` +UNET_PARAMS_IN_CHANNELS = 4 +UNET_PARAMS_OUT_CHANNELS = 4 +UNET_PARAMS_NUM_RES_BLOCKS = 2 +UNET_PARAMS_CONTEXT_DIM = 768 +UNET_PARAMS_NUM_HEADS = 8 +# UNET_PARAMS_USE_LINEAR_PROJECTION = False + +VAE_PARAMS_Z_CHANNELS = 4 +VAE_PARAMS_RESOLUTION = 256 +VAE_PARAMS_IN_CHANNELS = 3 +VAE_PARAMS_OUT_CH = 3 +VAE_PARAMS_CH = 128 +VAE_PARAMS_CH_MULT = [1, 2, 4, 4] +VAE_PARAMS_NUM_RES_BLOCKS = 2 + +# V2 +V2_UNET_PARAMS_ATTENTION_HEAD_DIM = [5, 10, 20, 20] +V2_UNET_PARAMS_CONTEXT_DIM = 1024 +# V2_UNET_PARAMS_USE_LINEAR_PROJECTION = True + +# Diffusersの設定を読み込むための参照モデル +DIFFUSERS_REF_MODEL_ID_V1 = "runwayml/stable-diffusion-v1-5" +DIFFUSERS_REF_MODEL_ID_V2 = "stabilityai/stable-diffusion-2-1" + + +# region StableDiffusion->Diffusersの変換コード +# convert_original_stable_diffusion_to_diffusers をコピーして修正している(ASL 2.0) + + +def shave_segments(path, n_shave_prefix_segments=1): + """ + Removes segments. Positive values shave the first segments, negative shave the last segments. + """ + if n_shave_prefix_segments >= 0: + return ".".join(path.split(".")[n_shave_prefix_segments:]) + else: + return ".".join(path.split(".")[:n_shave_prefix_segments]) + + +def renew_resnet_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside resnets to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item.replace("in_layers.0", "norm1") + new_item = new_item.replace("in_layers.2", "conv1") + + new_item = new_item.replace("out_layers.0", "norm2") + new_item = new_item.replace("out_layers.3", "conv2") + + new_item = new_item.replace("emb_layers.1", "time_emb_proj") + new_item = new_item.replace("skip_connection", "conv_shortcut") + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + +def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside resnets to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + new_item = new_item.replace("nin_shortcut", "conv_shortcut") + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + +def renew_attention_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside attentions to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + # new_item = new_item.replace('norm.weight', 'group_norm.weight') + # new_item = new_item.replace('norm.bias', 'group_norm.bias') + + # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') + # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') + + # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + 
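+
+# Example (illustrative): with the default n_shave_prefix_segments=0, these
+# renew_*_paths helpers only perform the local renames, e.g.
+#   renew_resnet_paths(["input_blocks.1.0.in_layers.0.weight"])
+#   -> [{"old": "input_blocks.1.0.in_layers.0.weight",
+#        "new": "input_blocks.1.0.norm1.weight"}]
+# The block-level renaming (e.g. "input_blocks.1.0" -> "down_blocks.0.resnets.0")
+# is applied later by assign_to_checkpoint via additional_replacements.
+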
+def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside attentions to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + # updated for latest diffusers + new_item = new_item.replace("norm.weight", "group_norm.weight") + new_item = new_item.replace("norm.bias", "group_norm.bias") + + new_item = new_item.replace("q.weight", "to_q.weight") + new_item = new_item.replace("q.bias", "to_q.bias") + + new_item = new_item.replace("k.weight", "to_k.weight") + new_item = new_item.replace("k.bias", "to_k.bias") + + new_item = new_item.replace("v.weight", "to_v.weight") + new_item = new_item.replace("v.bias", "to_v.bias") + + new_item = new_item.replace("proj_out.weight", "to_out.0.weight") + new_item = new_item.replace("proj_out.bias", "to_out.0.bias") + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + +def assign_to_checkpoint( + paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None +): + """ + This does the final conversion step: take locally converted weights and apply a global renaming + to them. It splits attention layers, and takes into account additional replacements + that may arise. + + Assigns the weights to the new checkpoint. + """ + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." + + # Splits the attention layers into three variables. + if attention_paths_to_split is not None: + for path, path_map in attention_paths_to_split.items(): + old_tensor = old_checkpoint[path] + channels = old_tensor.shape[0] // 3 + + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) + + num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 + + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) + query, key, value = old_tensor.split(channels // num_heads, dim=1) + + checkpoint[path_map["query"]] = query.reshape(target_shape) + checkpoint[path_map["key"]] = key.reshape(target_shape) + checkpoint[path_map["value"]] = value.reshape(target_shape) + + for path in paths: + new_path = path["new"] + + # These have already been assigned + if attention_paths_to_split is not None and new_path in attention_paths_to_split: + continue + + # Global renaming happens here + new_path = new_path.replace("middle_block.0", "mid_block.resnets.0") + new_path = new_path.replace("middle_block.1", "mid_block.attentions.0") + new_path = new_path.replace("middle_block.2", "mid_block.resnets.1") + + if additional_replacements is not None: + for replacement in additional_replacements: + new_path = new_path.replace(replacement["old"], replacement["new"]) + + # proj_attn.weight has to be converted from conv 1D to linear + is_attn_weight = "proj_attn.weight" in new_path or ("attentions" in new_path and "to_" in new_path) + shape = old_checkpoint[path["old"]].shape + if is_attn_weight and len(shape) == 3: + checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0] + elif is_attn_weight and len(shape) == 4: + checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0, 0] + else: + checkpoint[new_path] = old_checkpoint[path["old"]] + + +def conv_attn_to_linear(checkpoint): + keys = list(checkpoint.keys()) + attn_keys = ["query.weight", "key.weight", "value.weight"] + for key in keys: + if ".".join(key.split(".")[-2:]) in attn_keys: + if 
checkpoint[key].ndim > 2: + checkpoint[key] = checkpoint[key][:, :, 0, 0] + elif "proj_attn.weight" in key: + if checkpoint[key].ndim > 2: + checkpoint[key] = checkpoint[key][:, :, 0] + + +def linear_transformer_to_conv(checkpoint): + keys = list(checkpoint.keys()) + tf_keys = ["proj_in.weight", "proj_out.weight"] + for key in keys: + if ".".join(key.split(".")[-2:]) in tf_keys: + if checkpoint[key].ndim == 2: + checkpoint[key] = checkpoint[key].unsqueeze(2).unsqueeze(2) + + +def convert_ldm_unet_checkpoint(v2, checkpoint, config): + """ + Takes a state dict and a config, and returns a converted checkpoint. + """ + + # extract state_dict for UNet + unet_state_dict = {} + unet_key = "model.diffusion_model." + keys = list(checkpoint.keys()) + for key in keys: + if key.startswith(unet_key): + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) + + new_checkpoint = {} + + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] + + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] + new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] + + new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] + new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] + new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] + new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] + + # Retrieves the keys for the input blocks only + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) + input_blocks = { + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}." in key] for layer_id in range(num_input_blocks) + } + + # Retrieves the keys for the middle blocks only + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) + middle_blocks = { + layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}." in key] for layer_id in range(num_middle_blocks) + } + + # Retrieves the keys for the output blocks only + num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) + output_blocks = { + layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}." 
in key] for layer_id in range(num_output_blocks) + } + + for i in range(1, num_input_blocks): + block_id = (i - 1) // (config["layers_per_block"] + 1) + layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) + + resnets = [key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key] + attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] + + if f"input_blocks.{i}.0.op.weight" in unet_state_dict: + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(f"input_blocks.{i}.0.op.bias") + + paths = renew_resnet_paths(resnets) + meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"} + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + + if len(attentions): + paths = renew_attention_paths(attentions) + meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"} + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + + resnet_0 = middle_blocks[0] + attentions = middle_blocks[1] + resnet_1 = middle_blocks[2] + + resnet_0_paths = renew_resnet_paths(resnet_0) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + + resnet_1_paths = renew_resnet_paths(resnet_1) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + + attentions_paths = renew_attention_paths(attentions) + meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} + assign_to_checkpoint(attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + + for i in range(num_output_blocks): + block_id = i // (config["layers_per_block"] + 1) + layer_in_block_id = i % (config["layers_per_block"] + 1) + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] + output_block_list = {} + + for layer in output_block_layers: + layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1) + if layer_id in output_block_list: + output_block_list[layer_id].append(layer_name) + else: + output_block_list[layer_id] = [layer_name] + + if len(output_block_list) > 1: + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] + attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] + + resnet_0_paths = renew_resnet_paths(resnets) + paths = renew_resnet_paths(resnets) + + meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"} + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + + # オリジナル: + # if ["conv.weight", "conv.bias"] in output_block_list.values(): + # index = list(output_block_list.values()).index(["conv.weight", "conv.bias"]) + + # biasとweightの順番に依存しないようにする:もっといいやり方がありそうだが + for l in output_block_list.values(): + l.sort() + + if ["conv.bias", "conv.weight"] in output_block_list.values(): + index = list(output_block_list.values()).index(["conv.bias", "conv.weight"]) + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + 
f"output_blocks.{i}.{index}.conv.weight" + ] + + # Clear attentions as they have been attributed above. + if len(attentions) == 2: + attentions = [] + + if len(attentions): + paths = renew_attention_paths(attentions) + meta_path = { + "old": f"output_blocks.{i}.1", + "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", + } + assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config) + else: + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) + for path in resnet_0_paths: + old_path = ".".join(["output_blocks", str(i), path["old"]]) + new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]]) + + new_checkpoint[new_path] = unet_state_dict[old_path] + + # SDのv2では1*1のconv2dがlinearに変わっている + # 誤って Diffusers 側を conv2d のままにしてしまったので、変換必要 + if v2 and not config.get('use_linear_projection', False): + linear_transformer_to_conv(new_checkpoint) + + return new_checkpoint + + +def convert_ldm_vae_checkpoint(checkpoint, config): + # extract state dict for VAE + vae_state_dict = {} + vae_key = "first_stage_model." + keys = list(checkpoint.keys()) + for key in keys: + if key.startswith(vae_key): + vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) + # if len(vae_state_dict) == 0: + # # 渡されたcheckpointは.ckptから読み込んだcheckpointではなくvaeのstate_dict + # vae_state_dict = checkpoint + + new_checkpoint = {} + + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] + + new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] + new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] + + # Retrieves the keys for the encoder down blocks only + num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) + down_blocks = {layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)} + + # Retrieves the keys for the decoder up blocks only + num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) + up_blocks = {layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)} + + for i in range(num_down_blocks): + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] + + if 
f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" + ) + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] + num_mid_res_blocks = 2 + for i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] + paths = renew_vae_attention_paths(mid_attentions) + meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + conv_attn_to_linear(new_checkpoint) + + for i in range(num_up_blocks): + block_id = num_up_blocks - 1 - i + resnets = [key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key] + + if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] + num_mid_res_blocks = 2 + for i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] + paths = renew_vae_attention_paths(mid_attentions) + meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + conv_attn_to_linear(new_checkpoint) + return new_checkpoint + + +def create_unet_diffusers_config(v2, use_linear_projection_in_v2=False): + """ + Creates a config for the diffusers based on the config of the LDM model. 
+ """ + # unet_params = original_config.model.params.unet_config.params + + block_out_channels = [UNET_PARAMS_MODEL_CHANNELS * mult for mult in UNET_PARAMS_CHANNEL_MULT] + + down_block_types = [] + resolution = 1 + for i in range(len(block_out_channels)): + block_type = "CrossAttnDownBlock2D" if resolution in UNET_PARAMS_ATTENTION_RESOLUTIONS else "DownBlock2D" + down_block_types.append(block_type) + if i != len(block_out_channels) - 1: + resolution *= 2 + + up_block_types = [] + for i in range(len(block_out_channels)): + block_type = "CrossAttnUpBlock2D" if resolution in UNET_PARAMS_ATTENTION_RESOLUTIONS else "UpBlock2D" + up_block_types.append(block_type) + resolution //= 2 + + config = dict( + sample_size=UNET_PARAMS_IMAGE_SIZE, + in_channels=UNET_PARAMS_IN_CHANNELS, + out_channels=UNET_PARAMS_OUT_CHANNELS, + down_block_types=tuple(down_block_types), + up_block_types=tuple(up_block_types), + block_out_channels=tuple(block_out_channels), + layers_per_block=UNET_PARAMS_NUM_RES_BLOCKS, + cross_attention_dim=UNET_PARAMS_CONTEXT_DIM if not v2 else V2_UNET_PARAMS_CONTEXT_DIM, + attention_head_dim=UNET_PARAMS_NUM_HEADS if not v2 else V2_UNET_PARAMS_ATTENTION_HEAD_DIM, + # use_linear_projection=UNET_PARAMS_USE_LINEAR_PROJECTION if not v2 else V2_UNET_PARAMS_USE_LINEAR_PROJECTION, + ) + if v2 and use_linear_projection_in_v2: + config["use_linear_projection"] = True + + return config + + +def create_vae_diffusers_config(): + """ + Creates a config for the diffusers based on the config of the LDM model. + """ + # vae_params = original_config.model.params.first_stage_config.params.ddconfig + # _ = original_config.model.params.first_stage_config.params.embed_dim + block_out_channels = [VAE_PARAMS_CH * mult for mult in VAE_PARAMS_CH_MULT] + down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) + up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) + + config = dict( + sample_size=VAE_PARAMS_RESOLUTION, + in_channels=VAE_PARAMS_IN_CHANNELS, + out_channels=VAE_PARAMS_OUT_CH, + down_block_types=tuple(down_block_types), + up_block_types=tuple(up_block_types), + block_out_channels=tuple(block_out_channels), + latent_channels=VAE_PARAMS_Z_CHANNELS, + layers_per_block=VAE_PARAMS_NUM_RES_BLOCKS, + ) + return config + + +def convert_ldm_clip_checkpoint_v1(checkpoint): + keys = list(checkpoint.keys()) + text_model_dict = {} + for key in keys: + if key.startswith("cond_stage_model.transformer"): + text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] + return text_model_dict + + +def convert_ldm_clip_checkpoint_v2(checkpoint, max_length): + # 嫌になるくらい違うぞ! + def convert_key(key): + if not key.startswith("cond_stage_model"): + return None + + # common conversion + key = key.replace("cond_stage_model.model.transformer.", "text_model.encoder.") + key = key.replace("cond_stage_model.model.", "text_model.") + + if "resblocks" in key: + # resblocks conversion + key = key.replace(".resblocks.", ".layers.") + if ".ln_" in key: + key = key.replace(".ln_", ".layer_norm") + elif ".mlp." in key: + key = key.replace(".c_fc.", ".fc1.") + key = key.replace(".c_proj.", ".fc2.") + elif ".attn.out_proj" in key: + key = key.replace(".attn.out_proj.", ".self_attn.out_proj.") + elif ".attn.in_proj" in key: + key = None # 特殊なので後で処理する + else: + raise ValueError(f"unexpected key in SD: {key}") + elif ".positional_embedding" in key: + key = key.replace(".positional_embedding", ".embeddings.position_embedding.weight") + elif ".text_projection" in key: + key = None # 使われない??? 
+ elif ".logit_scale" in key: + key = None # 使われない??? + elif ".token_embedding" in key: + key = key.replace(".token_embedding.weight", ".embeddings.token_embedding.weight") + elif ".ln_final" in key: + key = key.replace(".ln_final", ".final_layer_norm") + return key + + keys = list(checkpoint.keys()) + new_sd = {} + for key in keys: + # remove resblocks 23 + if ".resblocks.23." in key: + continue + new_key = convert_key(key) + if new_key is None: + continue + new_sd[new_key] = checkpoint[key] + + # attnの変換 + for key in keys: + if ".resblocks.23." in key: + continue + if ".resblocks" in key and ".attn.in_proj_" in key: + # 三つに分割 + values = torch.chunk(checkpoint[key], 3) + + key_suffix = ".weight" if "weight" in key else ".bias" + key_pfx = key.replace("cond_stage_model.model.transformer.resblocks.", "text_model.encoder.layers.") + key_pfx = key_pfx.replace("_weight", "") + key_pfx = key_pfx.replace("_bias", "") + key_pfx = key_pfx.replace(".attn.in_proj", ".self_attn.") + new_sd[key_pfx + "q_proj" + key_suffix] = values[0] + new_sd[key_pfx + "k_proj" + key_suffix] = values[1] + new_sd[key_pfx + "v_proj" + key_suffix] = values[2] + + # rename or add position_ids + ANOTHER_POSITION_IDS_KEY = "text_model.encoder.text_model.embeddings.position_ids" + if ANOTHER_POSITION_IDS_KEY in new_sd: + # waifu diffusion v1.4 + position_ids = new_sd[ANOTHER_POSITION_IDS_KEY] + del new_sd[ANOTHER_POSITION_IDS_KEY] + else: + position_ids = torch.Tensor([list(range(max_length))]).to(torch.int64) + + new_sd["text_model.embeddings.position_ids"] = position_ids + return new_sd + + +# endregion + + +# region Diffusers->StableDiffusion の変換コード +# convert_diffusers_to_original_stable_diffusion をコピーして修正している(ASL 2.0) + + +def conv_transformer_to_linear(checkpoint): + keys = list(checkpoint.keys()) + tf_keys = ["proj_in.weight", "proj_out.weight"] + for key in keys: + if ".".join(key.split(".")[-2:]) in tf_keys: + if checkpoint[key].ndim > 2: + checkpoint[key] = checkpoint[key][:, :, 0, 0] + + +def convert_unet_state_dict_to_sd(v2, unet_state_dict): + unet_conversion_map = [ + # (stable-diffusion, HF Diffusers) + ("time_embed.0.weight", "time_embedding.linear_1.weight"), + ("time_embed.0.bias", "time_embedding.linear_1.bias"), + ("time_embed.2.weight", "time_embedding.linear_2.weight"), + ("time_embed.2.bias", "time_embedding.linear_2.bias"), + ("input_blocks.0.0.weight", "conv_in.weight"), + ("input_blocks.0.0.bias", "conv_in.bias"), + ("out.0.weight", "conv_norm_out.weight"), + ("out.0.bias", "conv_norm_out.bias"), + ("out.2.weight", "conv_out.weight"), + ("out.2.bias", "conv_out.bias"), + ] + + unet_conversion_map_resnet = [ + # (stable-diffusion, HF Diffusers) + ("in_layers.0", "norm1"), + ("in_layers.2", "conv1"), + ("out_layers.0", "norm2"), + ("out_layers.3", "conv2"), + ("emb_layers.1", "time_emb_proj"), + ("skip_connection", "conv_shortcut"), + ] + + unet_conversion_map_layer = [] + for i in range(4): + # loop over downblocks/upblocks + + for j in range(2): + # loop over resnets/attentions for downblocks + hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}." + sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0." + unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix)) + + if i < 3: + # no attention layers in down_blocks.3 + hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}." + sd_down_atn_prefix = f"input_blocks.{3*i + j + 1}.1." 
+ unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix)) + + for j in range(3): + # loop over resnets/attentions for upblocks + hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}." + sd_up_res_prefix = f"output_blocks.{3*i + j}.0." + unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix)) + + if i > 0: + # no attention layers in up_blocks.0 + hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}." + sd_up_atn_prefix = f"output_blocks.{3*i + j}.1." + unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix)) + + if i < 3: + # no downsample in down_blocks.3 + hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv." + sd_downsample_prefix = f"input_blocks.{3*(i+1)}.0.op." + unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix)) + + # no upsample in up_blocks.3 + hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0." + sd_upsample_prefix = f"output_blocks.{3*i + 2}.{1 if i == 0 else 2}." + unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix)) + + hf_mid_atn_prefix = "mid_block.attentions.0." + sd_mid_atn_prefix = "middle_block.1." + unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix)) + + for j in range(2): + hf_mid_res_prefix = f"mid_block.resnets.{j}." + sd_mid_res_prefix = f"middle_block.{2*j}." + unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix)) + + # buyer beware: this is a *brittle* function, + # and correct output requires that all of these pieces interact in + # the exact order in which I have arranged them. + mapping = {k: k for k in unet_state_dict.keys()} + for sd_name, hf_name in unet_conversion_map: + mapping[hf_name] = sd_name + for k, v in mapping.items(): + if "resnets" in k: + for sd_part, hf_part in unet_conversion_map_resnet: + v = v.replace(hf_part, sd_part) + mapping[k] = v + for k, v in mapping.items(): + for sd_part, hf_part in unet_conversion_map_layer: + v = v.replace(hf_part, sd_part) + mapping[k] = v + new_state_dict = {v: unet_state_dict[k] for k, v in mapping.items()} + + if v2: + conv_transformer_to_linear(new_state_dict) + + return new_state_dict + + +# ================# +# VAE Conversion # +# ================# + + +def reshape_weight_for_sd(w): + # convert HF linear weights to SD conv2d weights + return w.reshape(*w.shape, 1, 1) + + +def convert_vae_state_dict(vae_state_dict): + vae_conversion_map = [ + # (stable-diffusion, HF Diffusers) + ("nin_shortcut", "conv_shortcut"), + ("norm_out", "conv_norm_out"), + ("mid.attn_1.", "mid_block.attentions.0."), + ] + + for i in range(4): + # down_blocks have two resnets + for j in range(2): + hf_down_prefix = f"encoder.down_blocks.{i}.resnets.{j}." + sd_down_prefix = f"encoder.down.{i}.block.{j}." + vae_conversion_map.append((sd_down_prefix, hf_down_prefix)) + + if i < 3: + hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0." + sd_downsample_prefix = f"down.{i}.downsample." + vae_conversion_map.append((sd_downsample_prefix, hf_downsample_prefix)) + + hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0." + sd_upsample_prefix = f"up.{3-i}.upsample." + vae_conversion_map.append((sd_upsample_prefix, hf_upsample_prefix)) + + # up_blocks have three resnets + # also, up blocks in hf are numbered in reverse from sd + for j in range(3): + hf_up_prefix = f"decoder.up_blocks.{i}.resnets.{j}." + sd_up_prefix = f"decoder.up.{3-i}.block.{j}." 
+ vae_conversion_map.append((sd_up_prefix, hf_up_prefix)) + + # this part accounts for mid blocks in both the encoder and the decoder + for i in range(2): + hf_mid_res_prefix = f"mid_block.resnets.{i}." + sd_mid_res_prefix = f"mid.block_{i+1}." + vae_conversion_map.append((sd_mid_res_prefix, hf_mid_res_prefix)) + + vae_conversion_map_attn = [ + # (stable-diffusion, HF Diffusers) + ("norm.", "group_norm."), + ("q.", "query."), + ("k.", "key."), + ("v.", "value."), + ("proj_out.", "proj_attn."), + ] + + mapping = {k: k for k in vae_state_dict.keys()} + for k, v in mapping.items(): + for sd_part, hf_part in vae_conversion_map: + v = v.replace(hf_part, sd_part) + mapping[k] = v + for k, v in mapping.items(): + if "attentions" in k: + for sd_part, hf_part in vae_conversion_map_attn: + v = v.replace(hf_part, sd_part) + mapping[k] = v + new_state_dict = {v: vae_state_dict[k] for k, v in mapping.items()} + weights_to_convert = ["q", "k", "v", "proj_out"] + for k, v in new_state_dict.items(): + for weight_name in weights_to_convert: + if f"mid.attn_1.{weight_name}.weight" in k: + # print(f"Reshaping {k} for SD format") + new_state_dict[k] = reshape_weight_for_sd(v) + + return new_state_dict + + +# endregion + +# region 自作のモデル読み書きなど + + +def is_safetensors(path): + return os.path.splitext(path)[1].lower() == ".safetensors" + + +def load_checkpoint_with_text_encoder_conversion(ckpt_path, device="cpu"): + # text encoderの格納形式が違うモデルに対応する ('text_model'がない) + TEXT_ENCODER_KEY_REPLACEMENTS = [ + ("cond_stage_model.transformer.embeddings.", "cond_stage_model.transformer.text_model.embeddings."), + ("cond_stage_model.transformer.encoder.", "cond_stage_model.transformer.text_model.encoder."), + ("cond_stage_model.transformer.final_layer_norm.", "cond_stage_model.transformer.text_model.final_layer_norm."), + ] + + if is_safetensors(ckpt_path): + checkpoint = None + state_dict = load_file(ckpt_path) # , device) # may causes error + else: + checkpoint = torch.load(ckpt_path, map_location=device) + if "state_dict" in checkpoint: + state_dict = checkpoint["state_dict"] + else: + state_dict = checkpoint + checkpoint = None + + key_reps = [] + for rep_from, rep_to in TEXT_ENCODER_KEY_REPLACEMENTS: + for key in state_dict.keys(): + if key.startswith(rep_from): + new_key = rep_to + key[len(rep_from) :] + key_reps.append((key, new_key)) + + for key, new_key in key_reps: + state_dict[new_key] = state_dict[key] + del state_dict[key] + + return checkpoint, state_dict + + +# TODO dtype指定の動作が怪しいので確認する text_encoderを指定形式で作れるか未確認 +def load_models_from_stable_diffusion_checkpoint(v2, ckpt_path, device="cpu", dtype=None, unet_use_linear_projection_in_v2=False): + _, state_dict = load_checkpoint_with_text_encoder_conversion(ckpt_path, device) + + # Convert the UNet2DConditionModel model. + unet_config = create_unet_diffusers_config(v2, unet_use_linear_projection_in_v2) + converted_unet_checkpoint = convert_ldm_unet_checkpoint(v2, state_dict, unet_config) + + unet = UNet2DConditionModel(**unet_config).to(device) + info = unet.load_state_dict(converted_unet_checkpoint) + print("loading u-net:", info) + + # Convert the VAE model. 
+ vae_config = create_vae_diffusers_config() + converted_vae_checkpoint = convert_ldm_vae_checkpoint(state_dict, vae_config) + + vae = AutoencoderKL(**vae_config).to(device) + info = vae.load_state_dict(converted_vae_checkpoint) + print("loading vae:", info) + + # convert text_model + if v2: + converted_text_encoder_checkpoint = convert_ldm_clip_checkpoint_v2(state_dict, 77) + cfg = CLIPTextConfig( + vocab_size=49408, + hidden_size=1024, + intermediate_size=4096, + num_hidden_layers=23, + num_attention_heads=16, + max_position_embeddings=77, + hidden_act="gelu", + layer_norm_eps=1e-05, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + model_type="clip_text_model", + projection_dim=512, + torch_dtype="float32", + transformers_version="4.25.0.dev0", + ) + text_model = CLIPTextModel._from_config(cfg) + info = text_model.load_state_dict(converted_text_encoder_checkpoint) + else: + converted_text_encoder_checkpoint = convert_ldm_clip_checkpoint_v1(state_dict) + + logging.set_verbosity_error() # don't show annoying warning + text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").to(device) + logging.set_verbosity_warning() + + info = text_model.load_state_dict(converted_text_encoder_checkpoint) + print("loading text encoder:", info) + + return text_model, vae, unet + + +def convert_text_encoder_state_dict_to_sd_v2(checkpoint, make_dummy_weights=False): + def convert_key(key): + # position_idsの除去 + if ".position_ids" in key: + return None + + # common + key = key.replace("text_model.encoder.", "transformer.") + key = key.replace("text_model.", "") + if "layers" in key: + # resblocks conversion + key = key.replace(".layers.", ".resblocks.") + if ".layer_norm" in key: + key = key.replace(".layer_norm", ".ln_") + elif ".mlp." in key: + key = key.replace(".fc1.", ".c_fc.") + key = key.replace(".fc2.", ".c_proj.") + elif ".self_attn.out_proj" in key: + key = key.replace(".self_attn.out_proj.", ".attn.out_proj.") + elif ".self_attn." 
in key: + key = None # 特殊なので後で処理する + else: + raise ValueError(f"unexpected key in DiffUsers model: {key}") + elif ".position_embedding" in key: + key = key.replace("embeddings.position_embedding.weight", "positional_embedding") + elif ".token_embedding" in key: + key = key.replace("embeddings.token_embedding.weight", "token_embedding.weight") + elif "final_layer_norm" in key: + key = key.replace("final_layer_norm", "ln_final") + return key + + keys = list(checkpoint.keys()) + new_sd = {} + for key in keys: + new_key = convert_key(key) + if new_key is None: + continue + new_sd[new_key] = checkpoint[key] + + # attnの変換 + for key in keys: + if "layers" in key and "q_proj" in key: + # 三つを結合 + key_q = key + key_k = key.replace("q_proj", "k_proj") + key_v = key.replace("q_proj", "v_proj") + + value_q = checkpoint[key_q] + value_k = checkpoint[key_k] + value_v = checkpoint[key_v] + value = torch.cat([value_q, value_k, value_v]) + + new_key = key.replace("text_model.encoder.layers.", "transformer.resblocks.") + new_key = new_key.replace(".self_attn.q_proj.", ".attn.in_proj_") + new_sd[new_key] = value + + # 最後の層などを捏造するか + if make_dummy_weights: + print("make dummy weights for resblock.23, text_projection and logit scale.") + keys = list(new_sd.keys()) + for key in keys: + if key.startswith("transformer.resblocks.22."): + new_sd[key.replace(".22.", ".23.")] = new_sd[key].clone() # copyしないとsafetensorsの保存で落ちる + + # Diffusersに含まれない重みを作っておく + new_sd["text_projection"] = torch.ones((1024, 1024), dtype=new_sd[keys[0]].dtype, device=new_sd[keys[0]].device) + new_sd["logit_scale"] = torch.tensor(1) + + return new_sd + + +def save_stable_diffusion_checkpoint(v2, output_file, text_encoder, unet, ckpt_path, epochs, steps, save_dtype=None, vae=None): + if ckpt_path is not None: + # epoch/stepを参照する。またVAEがメモリ上にないときなど、もう一度VAEを含めて読み込む + checkpoint, state_dict = load_checkpoint_with_text_encoder_conversion(ckpt_path) + if checkpoint is None: # safetensors または state_dictのckpt + checkpoint = {} + strict = False + else: + strict = True + if "state_dict" in state_dict: + del state_dict["state_dict"] + else: + # 新しく作る + assert vae is not None, "VAE is required to save a checkpoint without a given checkpoint" + checkpoint = {} + state_dict = {} + strict = False + + def update_sd(prefix, sd): + for k, v in sd.items(): + key = prefix + k + assert not strict or key in state_dict, f"Illegal key in save SD: {key}" + if save_dtype is not None: + v = v.detach().clone().to("cpu").to(save_dtype) + state_dict[key] = v + + # Convert the UNet model + unet_state_dict = convert_unet_state_dict_to_sd(v2, unet.state_dict()) + update_sd("model.diffusion_model.", unet_state_dict) + + # Convert the text encoder model + if v2: + make_dummy = ckpt_path is None # 参照元のcheckpointがない場合は最後の層を前の層から複製して作るなどダミーの重みを入れる + text_enc_dict = convert_text_encoder_state_dict_to_sd_v2(text_encoder.state_dict(), make_dummy) + update_sd("cond_stage_model.model.", text_enc_dict) + else: + text_enc_dict = text_encoder.state_dict() + update_sd("cond_stage_model.transformer.", text_enc_dict) + + # Convert the VAE + if vae is not None: + vae_dict = convert_vae_state_dict(vae.state_dict()) + update_sd("first_stage_model.", vae_dict) + + # Put together new checkpoint + key_count = len(state_dict.keys()) + new_ckpt = {"state_dict": state_dict} + + # epoch and global_step are sometimes not int + try: + if "epoch" in checkpoint: + epochs += checkpoint["epoch"] + if "global_step" in checkpoint: + steps += checkpoint["global_step"] + except: + pass + + new_ckpt["epoch"] = 
epochs + new_ckpt["global_step"] = steps + + if is_safetensors(output_file): + # TODO Tensor以外のdictの値を削除したほうがいいか + save_file(state_dict, output_file) + else: + torch.save(new_ckpt, output_file) + + return key_count + + +def save_diffusers_checkpoint(v2, output_dir, text_encoder, unet, pretrained_model_name_or_path, vae=None, use_safetensors=False): + if pretrained_model_name_or_path is None: + # load default settings for v1/v2 + if v2: + pretrained_model_name_or_path = DIFFUSERS_REF_MODEL_ID_V2 + else: + pretrained_model_name_or_path = DIFFUSERS_REF_MODEL_ID_V1 + + scheduler = DDIMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler") + tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="tokenizer") + if vae is None: + vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae") + + pipeline = StableDiffusionPipeline( + unet=unet, + text_encoder=text_encoder, + vae=vae, + scheduler=scheduler, + tokenizer=tokenizer, + safety_checker=None, + feature_extractor=None, + requires_safety_checker=None, + ) + pipeline.save_pretrained(output_dir, safe_serialization=use_safetensors) + + +VAE_PREFIX = "first_stage_model." + + +def load_vae(vae_id, dtype): + print(f"load VAE: {vae_id}") + if os.path.isdir(vae_id) or not os.path.isfile(vae_id): + # Diffusers local/remote + try: + vae = AutoencoderKL.from_pretrained(vae_id, subfolder=None, torch_dtype=dtype) + except EnvironmentError as e: + print(f"exception occurs in loading vae: {e}") + print("retry with subfolder='vae'") + vae = AutoencoderKL.from_pretrained(vae_id, subfolder="vae", torch_dtype=dtype) + return vae + + # local + vae_config = create_vae_diffusers_config() + + if vae_id.endswith(".bin"): + # SD 1.5 VAE on Huggingface + converted_vae_checkpoint = torch.load(vae_id, map_location="cpu") + else: + # StableDiffusion + vae_model = load_file(vae_id, "cpu") if is_safetensors(vae_id) else torch.load(vae_id, map_location="cpu") + vae_sd = vae_model["state_dict"] if "state_dict" in vae_model else vae_model + + # vae only or full model + full_model = False + for vae_key in vae_sd: + if vae_key.startswith(VAE_PREFIX): + full_model = True + break + if not full_model: + sd = {} + for key, value in vae_sd.items(): + sd[VAE_PREFIX + key] = value + vae_sd = sd + del sd + + # Convert the VAE model. 
+ converted_vae_checkpoint = convert_ldm_vae_checkpoint(vae_sd, vae_config) + + vae = AutoencoderKL(**vae_config) + vae.load_state_dict(converted_vae_checkpoint) + return vae + + +# endregion + + +def make_bucket_resolutions(max_reso, min_size=256, max_size=1024, divisible=64): + max_width, max_height = max_reso + max_area = (max_width // divisible) * (max_height // divisible) + + resos = set() + + size = int(math.sqrt(max_area)) * divisible + resos.add((size, size)) + + size = min_size + while size <= max_size: + width = size + height = min(max_size, (max_area // (width // divisible)) * divisible) + resos.add((width, height)) + resos.add((height, width)) + + # # make additional resos + # if width >= height and width - divisible >= min_size: + # resos.add((width - divisible, height)) + # resos.add((height, width - divisible)) + # if height >= width and height - divisible >= min_size: + # resos.add((width, height - divisible)) + # resos.add((height - divisible, width)) + + size += divisible + + resos = list(resos) + resos.sort() + return resos + + +if __name__ == "__main__": + resos = make_bucket_resolutions((512, 768)) + print(len(resos)) + print(resos) + aspect_ratios = [w / h for w, h in resos] + print(aspect_ratios) + + ars = set() + for ar in aspect_ratios: + if ar in ars: + print("error! duplicate ar:", ar) + ars.add(ar) \ No newline at end of file diff --git a/toolkit/lycoris_utils.py b/toolkit/lycoris_utils.py new file mode 100644 index 00000000..a852e09c --- /dev/null +++ b/toolkit/lycoris_utils.py @@ -0,0 +1,512 @@ +# heavily based on https://github.com/KohakuBlueleaf/LyCORIS/blob/main/lycoris/utils.py + +from typing import * + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import torch.linalg as linalg + +from tqdm import tqdm + + +def make_sparse(t: torch.Tensor, sparsity=0.95): + abs_t = torch.abs(t) + np_array = abs_t.detach().cpu().numpy() + quan = float(np.quantile(np_array, sparsity)) + sparse_t = t.masked_fill(abs_t < quan, 0) + return sparse_t + + +def extract_conv( + weight: Union[torch.Tensor, nn.Parameter], + mode='fixed', + mode_param=0, + device='cpu', + is_cp=False, +) -> Tuple[nn.Parameter, nn.Parameter]: + weight = weight.to(device) + out_ch, in_ch, kernel_size, _ = weight.shape + + U, S, Vh = linalg.svd(weight.reshape(out_ch, -1)) + + if mode == 'fixed': + lora_rank = mode_param + elif mode == 'threshold': + assert mode_param >= 0 + lora_rank = torch.sum(S > mode_param) + elif mode == 'ratio': + assert 1 >= mode_param >= 0 + min_s = torch.max(S) * mode_param + lora_rank = torch.sum(S > min_s) + elif mode == 'quantile' or mode == 'percentile': + assert 1 >= mode_param >= 0 + s_cum = torch.cumsum(S, dim=0) + min_cum_sum = mode_param * torch.sum(S) + lora_rank = torch.sum(s_cum < min_cum_sum) + else: + raise NotImplementedError('Extract mode should be "fixed", "threshold", "ratio" or "quantile"') + lora_rank = max(1, lora_rank) + lora_rank = min(out_ch, in_ch, lora_rank) + if lora_rank >= out_ch / 2 and not is_cp: + return weight, 'full' + + U = U[:, :lora_rank] + S = S[:lora_rank] + U = U @ torch.diag(S) + Vh = Vh[:lora_rank, :] + + diff = (weight - (U @ Vh).reshape(out_ch, in_ch, kernel_size, kernel_size)).detach() + extract_weight_A = Vh.reshape(lora_rank, in_ch, kernel_size, kernel_size).detach() + extract_weight_B = U.reshape(out_ch, lora_rank, 1, 1).detach() + del U, S, Vh, weight + return (extract_weight_A, extract_weight_B, diff), 'low rank' + + +def extract_linear( + weight: Union[torch.Tensor, nn.Parameter], 
+ mode='fixed', + mode_param=0, + device='cpu', +) -> Tuple[nn.Parameter, nn.Parameter]: + weight = weight.to(device) + out_ch, in_ch = weight.shape + + U, S, Vh = linalg.svd(weight) + + if mode == 'fixed': + lora_rank = mode_param + elif mode == 'threshold': + assert mode_param >= 0 + lora_rank = torch.sum(S > mode_param) + elif mode == 'ratio': + assert 1 >= mode_param >= 0 + min_s = torch.max(S) * mode_param + lora_rank = torch.sum(S > min_s) + elif mode == 'quantile' or mode == 'percentile': + assert 1 >= mode_param >= 0 + s_cum = torch.cumsum(S, dim=0) + min_cum_sum = mode_param * torch.sum(S) + lora_rank = torch.sum(s_cum < min_cum_sum) + else: + raise NotImplementedError('Extract mode should be "fixed", "threshold", "ratio" or "quantile"') + lora_rank = max(1, lora_rank) + lora_rank = min(out_ch, in_ch, lora_rank) + if lora_rank >= out_ch / 2: + return weight, 'full' + + U = U[:, :lora_rank] + S = S[:lora_rank] + U = U @ torch.diag(S) + Vh = Vh[:lora_rank, :] + + diff = (weight - U @ Vh).detach() + extract_weight_A = Vh.reshape(lora_rank, in_ch).detach() + extract_weight_B = U.reshape(out_ch, lora_rank).detach() + del U, S, Vh, weight + return (extract_weight_A, extract_weight_B, diff), 'low rank' + + +def extract_diff( + base_model, + db_model, + mode='fixed', + linear_mode_param=0, + conv_mode_param=0, + extract_device='cpu', + use_bias=False, + sparsity=0.98, + small_conv=True +): + meta = {} + + UNET_TARGET_REPLACE_MODULE = [ + "Transformer2DModel", + "Attention", + "ResnetBlock2D", + "Downsample2D", + "Upsample2D" + ] + UNET_TARGET_REPLACE_NAME = [ + "conv_in", + "conv_out", + "time_embedding.linear_1", + "time_embedding.linear_2", + ] + TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPMLP"] + LORA_PREFIX_UNET = 'lora_unet' + LORA_PREFIX_TEXT_ENCODER = 'lora_te' + + def make_state_dict( + prefix, + root_module: torch.nn.Module, + target_module: torch.nn.Module, + target_replace_modules, + target_replace_names=[] + ): + loras = {} + temp = {} + temp_name = {} + + for name, module in root_module.named_modules(): + if module.__class__.__name__ in target_replace_modules: + temp[name] = {} + for child_name, child_module in module.named_modules(): + if child_module.__class__.__name__ not in {'Linear', 'Conv2d'}: + continue + temp[name][child_name] = child_module.weight + elif name in target_replace_names: + temp_name[name] = module.weight + + for name, module in tqdm(list(target_module.named_modules())): + if name in temp: + weights = temp[name] + for child_name, child_module in module.named_modules(): + lora_name = prefix + '.' + name + '.' 
+ child_name + lora_name = lora_name.replace('.', '_') + layer = child_module.__class__.__name__ + if layer in {'Linear', 'Conv2d'}: + root_weight = child_module.weight + if torch.allclose(root_weight, weights[child_name]): + continue + + if layer == 'Linear': + weight, decompose_mode = extract_linear( + (child_module.weight - weights[child_name]), + mode, + linear_mode_param, + device=extract_device, + ) + if decompose_mode == 'low rank': + extract_a, extract_b, diff = weight + elif layer == 'Conv2d': + is_linear = (child_module.weight.shape[2] == 1 + and child_module.weight.shape[3] == 1) + weight, decompose_mode = extract_conv( + (child_module.weight - weights[child_name]), + mode, + linear_mode_param if is_linear else conv_mode_param, + device=extract_device, + ) + if decompose_mode == 'low rank': + extract_a, extract_b, diff = weight + if small_conv and not is_linear and decompose_mode == 'low rank': + dim = extract_a.size(0) + (extract_c, extract_a, _), _ = extract_conv( + extract_a.transpose(0, 1), + 'fixed', dim, + extract_device, True + ) + extract_a = extract_a.transpose(0, 1) + extract_c = extract_c.transpose(0, 1) + loras[f'{lora_name}.lora_mid.weight'] = extract_c.detach().cpu().contiguous().half() + diff = child_module.weight - torch.einsum( + 'i j k l, j r, p i -> p r k l', + extract_c, extract_a.flatten(1, -1), extract_b.flatten(1, -1) + ).detach().cpu().contiguous() + del extract_c + else: + continue + if decompose_mode == 'low rank': + loras[f'{lora_name}.lora_down.weight'] = extract_a.detach().cpu().contiguous().half() + loras[f'{lora_name}.lora_up.weight'] = extract_b.detach().cpu().contiguous().half() + loras[f'{lora_name}.alpha'] = torch.Tensor([extract_a.shape[0]]).half() + if use_bias: + diff = diff.detach().cpu().reshape(extract_b.size(0), -1) + sparse_diff = make_sparse(diff, sparsity).to_sparse().coalesce() + + indices = sparse_diff.indices().to(torch.int16) + values = sparse_diff.values().half() + loras[f'{lora_name}.bias_indices'] = indices + loras[f'{lora_name}.bias_values'] = values + loras[f'{lora_name}.bias_size'] = torch.tensor(diff.shape).to(torch.int16) + del extract_a, extract_b, diff + elif decompose_mode == 'full': + loras[f'{lora_name}.diff'] = weight.detach().cpu().contiguous().half() + else: + raise NotImplementedError + elif name in temp_name: + weights = temp_name[name] + lora_name = prefix + '.' 
+ name + lora_name = lora_name.replace('.', '_') + layer = module.__class__.__name__ + + if layer in {'Linear', 'Conv2d'}: + root_weight = module.weight + if torch.allclose(root_weight, weights): + continue + + if layer == 'Linear': + weight, decompose_mode = extract_linear( + (root_weight - weights), + mode, + linear_mode_param, + device=extract_device, + ) + if decompose_mode == 'low rank': + extract_a, extract_b, diff = weight + elif layer == 'Conv2d': + is_linear = ( + root_weight.shape[2] == 1 + and root_weight.shape[3] == 1 + ) + weight, decompose_mode = extract_conv( + (root_weight - weights), + mode, + linear_mode_param if is_linear else conv_mode_param, + device=extract_device, + ) + if decompose_mode == 'low rank': + extract_a, extract_b, diff = weight + if small_conv and not is_linear and decompose_mode == 'low rank': + dim = extract_a.size(0) + (extract_c, extract_a, _), _ = extract_conv( + extract_a.transpose(0, 1), + 'fixed', dim, + extract_device, True + ) + extract_a = extract_a.transpose(0, 1) + extract_c = extract_c.transpose(0, 1) + loras[f'{lora_name}.lora_mid.weight'] = extract_c.detach().cpu().contiguous().half() + diff = root_weight - torch.einsum( + 'i j k l, j r, p i -> p r k l', + extract_c, extract_a.flatten(1, -1), extract_b.flatten(1, -1) + ).detach().cpu().contiguous() + del extract_c + else: + continue + if decompose_mode == 'low rank': + loras[f'{lora_name}.lora_down.weight'] = extract_a.detach().cpu().contiguous().half() + loras[f'{lora_name}.lora_up.weight'] = extract_b.detach().cpu().contiguous().half() + loras[f'{lora_name}.alpha'] = torch.Tensor([extract_a.shape[0]]).half() + if use_bias: + diff = diff.detach().cpu().reshape(extract_b.size(0), -1) + sparse_diff = make_sparse(diff, sparsity).to_sparse().coalesce() + + indices = sparse_diff.indices().to(torch.int16) + values = sparse_diff.values().half() + loras[f'{lora_name}.bias_indices'] = indices + loras[f'{lora_name}.bias_values'] = values + loras[f'{lora_name}.bias_size'] = torch.tensor(diff.shape).to(torch.int16) + del extract_a, extract_b, diff + elif decompose_mode == 'full': + loras[f'{lora_name}.diff'] = weight.detach().cpu().contiguous().half() + else: + raise NotImplementedError + return loras + + text_encoder_loras = make_state_dict( + LORA_PREFIX_TEXT_ENCODER, + base_model[0], db_model[0], + TEXT_ENCODER_TARGET_REPLACE_MODULE + ) + + unet_loras = make_state_dict( + LORA_PREFIX_UNET, + base_model[2], db_model[2], + UNET_TARGET_REPLACE_MODULE, + UNET_TARGET_REPLACE_NAME + ) + print(len(text_encoder_loras), len(unet_loras)) + # the | will + return (text_encoder_loras | unet_loras), meta + + +def get_module( + lyco_state_dict: Dict, + lora_name +): + if f'{lora_name}.lora_up.weight' in lyco_state_dict: + up = lyco_state_dict[f'{lora_name}.lora_up.weight'] + down = lyco_state_dict[f'{lora_name}.lora_down.weight'] + mid = lyco_state_dict.get(f'{lora_name}.lora_mid.weight', None) + alpha = lyco_state_dict.get(f'{lora_name}.alpha', None) + return 'locon', (up, down, mid, alpha) + elif f'{lora_name}.hada_w1_a' in lyco_state_dict: + w1a = lyco_state_dict[f'{lora_name}.hada_w1_a'] + w1b = lyco_state_dict[f'{lora_name}.hada_w1_b'] + w2a = lyco_state_dict[f'{lora_name}.hada_w2_a'] + w2b = lyco_state_dict[f'{lora_name}.hada_w2_b'] + t1 = lyco_state_dict.get(f'{lora_name}.hada_t1', None) + t2 = lyco_state_dict.get(f'{lora_name}.hada_t2', None) + alpha = lyco_state_dict.get(f'{lora_name}.alpha', None) + return 'hada', (w1a, w1b, w2a, w2b, t1, t2, alpha) + elif f'{lora_name}.weight' in lyco_state_dict: + 
weight = lyco_state_dict[f'{lora_name}.weight'] + on_input = lyco_state_dict.get(f'{lora_name}.on_input', False) + return 'ia3', (weight, on_input) + elif (f'{lora_name}.lokr_w1' in lyco_state_dict + or f'{lora_name}.lokr_w1_a' in lyco_state_dict): + w1 = lyco_state_dict.get(f'{lora_name}.lokr_w1', None) + w1a = lyco_state_dict.get(f'{lora_name}.lokr_w1_a', None) + w1b = lyco_state_dict.get(f'{lora_name}.lokr_w1_b', None) + w2 = lyco_state_dict.get(f'{lora_name}.lokr_w2', None) + w2a = lyco_state_dict.get(f'{lora_name}.lokr_w2_a', None) + w2b = lyco_state_dict.get(f'{lora_name}.lokr_w2_b', None) + t1 = lyco_state_dict.get(f'{lora_name}.lokr_t1', None) + t2 = lyco_state_dict.get(f'{lora_name}.lokr_t2', None) + alpha = lyco_state_dict.get(f'{lora_name}.alpha', None) + return 'kron', (w1, w1a, w1b, w2, w2a, w2b, t1, t2, alpha) + elif f'{lora_name}.diff' in lyco_state_dict: + return 'full', lyco_state_dict[f'{lora_name}.diff'] + else: + return 'None', () + + +def cp_weight_from_conv( + up, down, mid +): + up = up.reshape(up.size(0), up.size(1)) + down = down.reshape(down.size(0), down.size(1)) + return torch.einsum('m n w h, i m, n j -> i j w h', mid, up, down) + + +def cp_weight( + wa, wb, t +): + temp = torch.einsum('i j k l, j r -> i r k l', t, wb) + return torch.einsum('i j k l, i r -> r j k l', temp, wa) + + +@torch.no_grad() +def rebuild_weight(module_type, params, orig_weight, scale=1): + if orig_weight is None: + return orig_weight + merged = orig_weight + if module_type == 'locon': + up, down, mid, alpha = params + if alpha is not None: + scale *= alpha / up.size(1) + if mid is not None: + rebuild = cp_weight_from_conv(up, down, mid) + else: + rebuild = up.reshape(up.size(0), -1) @ down.reshape(down.size(0), -1) + merged = orig_weight + rebuild.reshape(orig_weight.shape) * scale + del up, down, mid, alpha, params, rebuild + elif module_type == 'hada': + w1a, w1b, w2a, w2b, t1, t2, alpha = params + if alpha is not None: + scale *= alpha / w1b.size(0) + if t1 is not None: + rebuild1 = cp_weight(w1a, w1b, t1) + else: + rebuild1 = w1a @ w1b + if t2 is not None: + rebuild2 = cp_weight(w2a, w2b, t2) + else: + rebuild2 = w2a @ w2b + rebuild = (rebuild1 * rebuild2).reshape(orig_weight.shape) + merged = orig_weight + rebuild * scale + del w1a, w1b, w2a, w2b, t1, t2, alpha, params, rebuild, rebuild1, rebuild2 + elif module_type == 'ia3': + weight, on_input = params + if not on_input: + weight = weight.reshape(-1, 1) + merged = orig_weight + weight * orig_weight * scale + del weight, on_input, params + elif module_type == 'kron': + w1, w1a, w1b, w2, w2a, w2b, t1, t2, alpha = params + if alpha is not None and (w1b is not None or w2b is not None): + scale *= alpha / (w1b.size(0) if w1b else w2b.size(0)) + if w1a is not None and w1b is not None: + if t1: + w1 = cp_weight(w1a, w1b, t1) + else: + w1 = w1a @ w1b + if w2a is not None and w2b is not None: + if t2: + w2 = cp_weight(w2a, w2b, t2) + else: + w2 = w2a @ w2b + rebuild = torch.kron(w1, w2).reshape(orig_weight.shape) + merged = orig_weight + rebuild * scale + del w1, w1a, w1b, w2, w2a, w2b, t1, t2, alpha, params, rebuild + elif module_type == 'full': + rebuild = params.reshape(orig_weight.shape) + merged = orig_weight + rebuild * scale + del params, rebuild + + return merged + + +def merge( + base_model, + lyco_state_dict, + scale: float = 1.0, + device='cpu' +): + UNET_TARGET_REPLACE_MODULE = [ + "Transformer2DModel", + "Attention", + "ResnetBlock2D", + "Downsample2D", + "Upsample2D" + ] + UNET_TARGET_REPLACE_NAME = [ + "conv_in", + 
"conv_out", + "time_embedding.linear_1", + "time_embedding.linear_2", + ] + TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPMLP"] + LORA_PREFIX_UNET = 'lora_unet' + LORA_PREFIX_TEXT_ENCODER = 'lora_te' + merged = 0 + + def merge_state_dict( + prefix, + root_module: torch.nn.Module, + lyco_state_dict: Dict[str, torch.Tensor], + target_replace_modules, + target_replace_names=[] + ): + nonlocal merged + for name, module in tqdm(list(root_module.named_modules()), desc=f'Merging {prefix}'): + if module.__class__.__name__ in target_replace_modules: + for child_name, child_module in module.named_modules(): + if child_module.__class__.__name__ not in {'Linear', 'Conv2d'}: + continue + lora_name = prefix + '.' + name + '.' + child_name + lora_name = lora_name.replace('.', '_') + + result = rebuild_weight(*get_module( + lyco_state_dict, lora_name + ), getattr(child_module, 'weight'), scale) + if result is not None: + merged += 1 + child_module.requires_grad_(False) + child_module.weight.copy_(result) + elif name in target_replace_names: + lora_name = prefix + '.' + name + lora_name = lora_name.replace('.', '_') + + result = rebuild_weight(*get_module( + lyco_state_dict, lora_name + ), getattr(module, 'weight'), scale) + if result is not None: + merged += 1 + module.requires_grad_(False) + module.weight.copy_(result) + + if device == 'cpu': + for k, v in tqdm(list(lyco_state_dict.items()), desc='Converting Dtype'): + lyco_state_dict[k] = v.float() + + merge_state_dict( + LORA_PREFIX_TEXT_ENCODER, + base_model[0], + lyco_state_dict, + TEXT_ENCODER_TARGET_REPLACE_MODULE, + UNET_TARGET_REPLACE_NAME + ) + merge_state_dict( + LORA_PREFIX_UNET, + base_model[2], + lyco_state_dict, + UNET_TARGET_REPLACE_MODULE, + UNET_TARGET_REPLACE_NAME + ) + print(f'{merged} Modules been merged') diff --git a/toolkit/metadata.py b/toolkit/metadata.py new file mode 100644 index 00000000..44d6a2f0 --- /dev/null +++ b/toolkit/metadata.py @@ -0,0 +1,24 @@ +import json + +software_meta = { + "name": "ai-toolkit", + "url": "https://github.com/ostris/ai-toolkit" +} + + +def create_meta(dict_list, name=None): + meta = {} + for d in dict_list: + for key, value in d.items(): + meta[key] = value + + if "name" not in meta: + meta["name"] = "[name]" + + meta["software"] = software_meta + + # convert to string to handle replacements + meta_string = json.dumps(meta) + if name is not None: + meta_string = meta_string.replace("[name]", name) + return json.loads(meta_string) diff --git a/toolkit/paths.py b/toolkit/paths.py new file mode 100644 index 00000000..01e15ffc --- /dev/null +++ b/toolkit/paths.py @@ -0,0 +1,4 @@ +import os + +TOOLKIT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +CONFIG_ROOT = os.path.join(TOOLKIT_ROOT, 'config')