# ai-toolkit/toolkit/train_tools.py

import argparse
import hashlib
import json
import os
import time
from typing import TYPE_CHECKING, Union
import sys
from toolkit.paths import SD_SCRIPTS_ROOT
sys.path.append(SD_SCRIPTS_ROOT)
from diffusers import (
    StableDiffusionPipeline,
    DDPMScheduler,
    EulerAncestralDiscreteScheduler,
    DPMSolverMultistepScheduler,
    DPMSolverSinglestepScheduler,
    LMSDiscreteScheduler,
    PNDMScheduler,
    DDIMScheduler,
    EulerDiscreteScheduler,
    HeunDiscreteScheduler,
    KDPM2DiscreteScheduler,
    KDPM2AncestralDiscreteScheduler,
)
from library.lpw_stable_diffusion import StableDiffusionLongPromptWeightingPipeline
import torch
import re
SCHEDULER_LINEAR_START = 0.00085
SCHEDULER_LINEAR_END = 0.0120
SCHEDULER_TIMESTEPS = 1000
SCHEDLER_SCHEDULE = "scaled_linear"
UNET_ATTENTION_TIME_EMBED_DIM = 256 # XL
TEXT_ENCODER_2_PROJECTION_DIM = 1280
UNET_PROJECTION_CLASS_EMBEDDING_INPUT_DIM = 2816
def get_torch_dtype(dtype_str):
    # if it is already a torch dtype, return it
    if isinstance(dtype_str, torch.dtype):
        return dtype_str
    if dtype_str == "float" or dtype_str == "fp32" or dtype_str == "single" or dtype_str == "float32":
        return torch.float
    if dtype_str == "fp16" or dtype_str == "half" or dtype_str == "float16":
        return torch.float16
    if dtype_str == "bf16" or dtype_str == "bfloat16":
        return torch.bfloat16
    return dtype_str
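
# Illustrative usage (added note, not part of the original behavior):
#   get_torch_dtype("bf16")       # -> torch.bfloat16
#   get_torch_dtype("fp16")       # -> torch.float16
#   get_torch_dtype(torch.float)  # returned unchanged
# Unknown strings fall through and are returned as-is.
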
def replace_filewords_prompt(prompt, args: argparse.Namespace):
    # the name_replace attr may not be present on args
    if hasattr(args, "name_replace") and args.name_replace is not None:
        # replace [name] with args.name_replace
        prompt = prompt.replace("[name]", args.name_replace)
    if hasattr(args, "prepend") and args.prepend is not None:
        # prepend to every item in the prompt file
        prompt = args.prepend + ' ' + prompt
    if hasattr(args, "append") and args.append is not None:
        # append to every item in the prompt file
        prompt = prompt + ' ' + args.append
    return prompt
def replace_filewords_in_dataset_group(dataset_group, args: argparse.Namespace):
    # the name_replace attr may not be present on args
    if hasattr(args, "name_replace") and args.name_replace is not None:
        if not len(dataset_group.image_data) > 0:
            # throw error
            raise ValueError("dataset_group.image_data is empty")
        for key in dataset_group.image_data:
            dataset_group.image_data[key].caption = dataset_group.image_data[key].caption.replace(
                "[name]", args.name_replace)
    return dataset_group
def get_seeds_from_latents(latents):
    # latents shape = (batch_size, 4, height, width)
    # for speed we only use an 8x8 slice of the first channel
    seeds = []
    # split the batch up
    for i in range(latents.shape[0]):
        # use only the first channel, multiply by 255 and convert to int
        tensor = latents[i, 0, :, :] * 255.0  # shape = (height, width)
        # slice 8x8
        tensor = tensor[:8, :8]
        # clip to 0-255
        tensor = torch.clamp(tensor, 0, 255)
        # convert to 8-bit int
        tensor = tensor.to(torch.uint8)
        # convert to bytes
        tensor_bytes = tensor.cpu().numpy().tobytes()
        # hash
        hash_object = hashlib.sha256(tensor_bytes)
        # get hex
        hex_dig = hash_object.hexdigest()
        # convert to a 32-bit int
        seed = int(hex_dig, 16) % (2 ** 32)
        # append
        seeds.append(seed)
    return seeds
def get_noise_from_latents(latents):
    seed_list = get_seeds_from_latents(latents)
    noise = []
    for seed in seed_list:
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        noise.append(torch.randn_like(latents[0]))
    return torch.stack(noise)
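
# Note (added for clarity): because the seed is derived from a hash of an 8x8
# slice of each latent, get_noise_from_latents() returns the same noise for the
# same latents every time it is called, i.e. the noise is deterministic per
# sample rather than freshly random on every call.
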
# mix 0 is completely the noise mean, mix 1 is completely the target mean
def match_noise_to_target_mean_offset(noise, target, mix=0.5, dim=None):
    dim = dim or (1, 2, 3)
    # compute per-sample means over `dim` (by default all but the batch dim)
    noise_mean = noise.mean(dim=dim, keepdim=True)
    target_mean = target.mean(dim=dim, keepdim=True)
    new_noise_mean = mix * target_mean + (1 - mix) * noise_mean
    noise = noise - noise_mean + new_noise_mean
    return noise
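
# Worked example (added for clarity): with mix=0.5 the per-sample mean of the
# returned noise sits halfway between the original noise mean and the target
# mean; mix=0.0 leaves the noise mean untouched and mix=1.0 matches it to the
# target exactly. Only the mean over `dim` is shifted; the noise pattern itself
# is preserved.
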
def sample_images(
        accelerator,
        args: argparse.Namespace,
        epoch,
        steps,
        device,
        vae,
        tokenizer,
        text_encoder,
        unet,
        prompt_replacement=None,
        force_sample=False
):
    """
    Uses a modified version of StableDiffusionLongPromptWeightingPipeline, so clip skip
    and prompt weighting are supported.
    """
    if not force_sample:
        if args.sample_every_n_steps is None and args.sample_every_n_epochs is None:
            return
        if args.sample_every_n_epochs is not None:
            # sample_every_n_steps is ignored
            if epoch is None or epoch % args.sample_every_n_epochs != 0:
                return
        else:
            if steps % args.sample_every_n_steps != 0 or epoch is not None:  # steps is not divisible or end of epoch
                return
    is_sample_only = args.sample_only
    is_generating_only = hasattr(args, "is_generating_only") and args.is_generating_only

    print(f"\ngenerating sample images at step / サンプル画像生成 ステップ: {steps}")
    if not os.path.isfile(args.sample_prompts):
        print(f"No prompt file / プロンプトファイルがありません: {args.sample_prompts}")
        return

    org_vae_device = vae.device  # should be on the CPU at this point
    vae.to(device)

    # read prompts
    # with open(args.sample_prompts, "rt", encoding="utf-8") as f:
    #     prompts = f.readlines()

    if args.sample_prompts.endswith(".txt"):
        with open(args.sample_prompts, "r", encoding="utf-8") as f:
            lines = f.readlines()
        prompts = [line.strip() for line in lines if len(line.strip()) > 0 and line[0] != "#"]
    elif args.sample_prompts.endswith(".json"):
        with open(args.sample_prompts, "r", encoding="utf-8") as f:
            prompts = json.load(f)
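
    # Illustrative prompt file contents (hypothetical examples, based on the
    # parsing above and the " --" flag / dict handling below):
    #   sample_prompts.txt:
    #     a photo of [name] --w 512 --h 768 --d 42 --s 30 --l 7.5 --n blurry
    #   sample_prompts.json:
    #     [{"prompt": "a photo of [name]", "negative_prompt": "blurry",
    #       "width": 512, "height": 768, "seed": 42, "sample_steps": 30, "scale": 7.5}]
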
    # prepare the scheduler
    sched_init_args = {}
    if args.sample_sampler == "ddim":
        scheduler_cls = DDIMScheduler
    elif args.sample_sampler == "ddpm":  # ddpm produces broken results, so it has been removed from the options
        scheduler_cls = DDPMScheduler
    elif args.sample_sampler == "pndm":
        scheduler_cls = PNDMScheduler
    elif args.sample_sampler == "lms" or args.sample_sampler == "k_lms":
        scheduler_cls = LMSDiscreteScheduler
    elif args.sample_sampler == "euler" or args.sample_sampler == "k_euler":
        scheduler_cls = EulerDiscreteScheduler
    elif args.sample_sampler == "euler_a" or args.sample_sampler == "k_euler_a":
        scheduler_cls = EulerAncestralDiscreteScheduler
    elif args.sample_sampler == "dpmsolver" or args.sample_sampler == "dpmsolver++":
        scheduler_cls = DPMSolverMultistepScheduler
        sched_init_args["algorithm_type"] = args.sample_sampler
    elif args.sample_sampler == "dpmsingle":
        scheduler_cls = DPMSolverSinglestepScheduler
    elif args.sample_sampler == "heun":
        scheduler_cls = HeunDiscreteScheduler
    elif args.sample_sampler == "dpm_2" or args.sample_sampler == "k_dpm_2":
        scheduler_cls = KDPM2DiscreteScheduler
    elif args.sample_sampler == "dpm_2_a" or args.sample_sampler == "k_dpm_2_a":
        scheduler_cls = KDPM2AncestralDiscreteScheduler
    else:
        scheduler_cls = DDIMScheduler

    if args.v_parameterization:
        sched_init_args["prediction_type"] = "v_prediction"

    scheduler = scheduler_cls(
        num_train_timesteps=SCHEDULER_TIMESTEPS,
        beta_start=SCHEDULER_LINEAR_START,
        beta_end=SCHEDULER_LINEAR_END,
        beta_schedule=SCHEDLER_SCHEDULE,
        **sched_init_args,
    )

    # set clip_sample to True
    if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is False:
        # print("set clip_sample to True")
        scheduler.config.clip_sample = True
    pipeline = StableDiffusionLongPromptWeightingPipeline(
        text_encoder=text_encoder,
        vae=vae,
        unet=unet,
        tokenizer=tokenizer,
        scheduler=scheduler,
        clip_skip=args.clip_skip,
        safety_checker=None,
        feature_extractor=None,
        requires_safety_checker=False,
    )
    pipeline.to(device)

    if is_generating_only:
        save_dir = args.output_dir
    else:
        save_dir = args.output_dir + "/sample"
    os.makedirs(save_dir, exist_ok=True)

    rng_state = torch.get_rng_state()
    cuda_rng_state = torch.cuda.get_rng_state() if torch.cuda.is_available() else None
    with torch.no_grad():
        with accelerator.autocast():
            for i, prompt in enumerate(prompts):
                if not accelerator.is_main_process:
                    continue

                if isinstance(prompt, dict):
                    negative_prompt = prompt.get("negative_prompt")
                    sample_steps = prompt.get("sample_steps", 30)
                    width = prompt.get("width", 512)
                    height = prompt.get("height", 512)
                    scale = prompt.get("scale", 7.5)
                    seed = prompt.get("seed")
                    prompt = prompt.get("prompt")

                    prompt = replace_filewords_prompt(prompt, args)
                    negative_prompt = replace_filewords_prompt(negative_prompt, args)
                else:
                    prompt = replace_filewords_prompt(prompt, args)
                    # prompt = prompt.strip()
                    # if len(prompt) == 0 or prompt[0] == "#":
                    #     continue

                    # subset of gen_img_diffusers
                    prompt_args = prompt.split(" --")
                    prompt = prompt_args[0]
                    negative_prompt = None
                    sample_steps = 30
                    width = height = 512
                    scale = 7.5
                    seed = None
                    for parg in prompt_args:
                        try:
                            m = re.match(r"w (\d+)", parg, re.IGNORECASE)
                            if m:
                                width = int(m.group(1))
                                continue

                            m = re.match(r"h (\d+)", parg, re.IGNORECASE)
                            if m:
                                height = int(m.group(1))
                                continue

                            m = re.match(r"d (\d+)", parg, re.IGNORECASE)
                            if m:
                                seed = int(m.group(1))
                                continue

                            m = re.match(r"s (\d+)", parg, re.IGNORECASE)
                            if m:  # steps
                                sample_steps = max(1, min(1000, int(m.group(1))))
                                continue

                            m = re.match(r"l ([\d\.]+)", parg, re.IGNORECASE)
                            if m:  # scale
                                scale = float(m.group(1))
                                continue

                            m = re.match(r"n (.+)", parg, re.IGNORECASE)
                            if m:  # negative prompt
                                negative_prompt = m.group(1)
                                continue

                        except ValueError as ex:
                            print(f"Exception in parsing / 解析エラー: {parg}")
                            print(ex)
                if seed is not None:
                    torch.manual_seed(seed)
                    torch.cuda.manual_seed(seed)

                if prompt_replacement is not None:
                    prompt = prompt.replace(prompt_replacement[0], prompt_replacement[1])
                    if negative_prompt is not None:
                        negative_prompt = negative_prompt.replace(prompt_replacement[0], prompt_replacement[1])

                height = max(64, height - height % 8)  # round to divisible by 8
                width = max(64, width - width % 8)  # round to divisible by 8
                print(f"prompt: {prompt}")
                print(f"negative_prompt: {negative_prompt}")
                print(f"height: {height}")
                print(f"width: {width}")
                print(f"sample_steps: {sample_steps}")
                print(f"scale: {scale}")
                image = pipeline(
                    prompt=prompt,
                    height=height,
                    width=width,
                    num_inference_steps=sample_steps,
                    guidance_scale=scale,
                    negative_prompt=negative_prompt,
                ).images[0]
                ts_str = time.strftime("%Y%m%d%H%M%S", time.localtime())
                num_suffix = f"e{epoch:06d}" if epoch is not None else f"{steps:06d}"
                seed_suffix = "" if seed is None else f"_{seed}"
                if is_generating_only:
                    img_filename = (
                        f"{'' if args.output_name is None else args.output_name + '_'}{ts_str}_{num_suffix}_{i:02d}{seed_suffix}.png"
                    )
                else:
                    img_filename = (
                        f"{'' if args.output_name is None else args.output_name + '_'}{ts_str}_{i:04d}{seed_suffix}.png"
                    )

                if is_sample_only:
                    # write the prompt to a txt file next to the image
                    img_path_no_ext = os.path.join(save_dir, img_filename[:-4])
                    with open(img_path_no_ext + ".txt", "w") as f:
                        f.write(prompt)
                        # the with block closes the file, no explicit close needed

                image.save(os.path.join(save_dir, img_filename))
                # send logs only when wandb is enabled
                try:
                    wandb_tracker = accelerator.get_tracker("wandb")
                    try:
                        import wandb
                    except ImportError:  # checked once beforehand, so this should not error here
                        raise ImportError("No wandb / wandb がインストールされていないようです")
                    wandb_tracker.log({f"sample_{i}": wandb.Image(image)})
                except:  # wandb is disabled
                    pass

    # clear the pipeline and cache to reduce vram usage
    del pipeline
    torch.cuda.empty_cache()

    torch.set_rng_state(rng_state)
    if cuda_rng_state is not None:
        torch.cuda.set_rng_state(cuda_rng_state)
    vae.to(org_vae_device)
# https://www.crosslabs.org//blog/diffusion-with-offset-noise
def apply_noise_offset(noise, noise_offset):
    if noise_offset is None or noise_offset < 0.0000001:
        return noise
    noise = noise + noise_offset * torch.randn((noise.shape[0], noise.shape[1], 1, 1), device=noise.device)
    return noise
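
# Note (added for clarity): the offset adds one random constant per image and
# channel, which shifts the mean of the noise; per the post linked above this
# helps the model learn overall brightness/darkness shifts. Small offsets
# (values on the order of 0.1 are commonly cited) are typical, but the right
# value depends on the training setup.
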
if TYPE_CHECKING:
    from toolkit.stable_diffusion_model import PromptEmbeds


def concat_prompt_embeddings(
        unconditional: 'PromptEmbeds',
        conditional: 'PromptEmbeds',
        n_imgs: int,
):
    from toolkit.stable_diffusion_model import PromptEmbeds
    text_embeds = torch.cat(
        [unconditional.text_embeds, conditional.text_embeds]
    ).repeat_interleave(n_imgs, dim=0)
    pooled_embeds = None
    if unconditional.pooled_embeds is not None and conditional.pooled_embeds is not None:
        pooled_embeds = torch.cat(
            [unconditional.pooled_embeds, conditional.pooled_embeds]
        ).repeat_interleave(n_imgs, dim=0)
    return PromptEmbeds([text_embeds, pooled_embeds])
def addnet_hash_safetensors(b):
    """New model hash used by sd-webui-additional-networks for .safetensors format files"""
    hash_sha256 = hashlib.sha256()
    blksize = 1024 * 1024

    b.seek(0)
    header = b.read(8)
    n = int.from_bytes(header, "little")

    offset = n + 8
    b.seek(offset)
    for chunk in iter(lambda: b.read(blksize), b""):
        hash_sha256.update(chunk)

    return hash_sha256.hexdigest()
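
# Illustrative usage (hypothetical path): the function assumes the standard
# safetensors layout, an 8-byte little-endian header length followed by the
# JSON header and then the tensor data, and hashes only the tensor data that
# follows the header.
#   with open("model.safetensors", "rb") as f:
#       print(addnet_hash_safetensors(f))
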
def addnet_hash_legacy(b):
    """Old model hash used by sd-webui-additional-networks for .safetensors format files"""
    m = hashlib.sha256()

    b.seek(0x100000)
    m.update(b.read(0x10000))
    return m.hexdigest()[0:8]
if TYPE_CHECKING:
    from transformers import CLIPTextModel, CLIPTokenizer, CLIPTextModelWithProjection


def text_tokenize(
        tokenizer: 'CLIPTokenizer',  # normally one tokenizer, two for XL
        prompts: list[str],
):
    return tokenizer(
        prompts,
        padding="max_length",
        max_length=tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    ).input_ids
# https://github.com/huggingface/diffusers/blob/78922ed7c7e66c20aa95159c7b7a6057ba7d590d/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py#L334-L348
def text_encode_xl(
        text_encoder: Union['CLIPTextModel', 'CLIPTextModelWithProjection'],
        tokens: torch.FloatTensor,
        num_images_per_prompt: int = 1,
):
    prompt_embeds = text_encoder(
        tokens.to(text_encoder.device), output_hidden_states=True
    )
    pooled_prompt_embeds = prompt_embeds[0]
    prompt_embeds = prompt_embeds.hidden_states[-2]  # always penultimate layer

    bs_embed, seq_len, _ = prompt_embeds.shape
    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
    prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

    return prompt_embeds, pooled_prompt_embeds
def encode_prompts_xl(
        tokenizers: list['CLIPTokenizer'],
        text_encoders: list[Union['CLIPTextModel', 'CLIPTextModelWithProjection']],
        prompts: list[str],
        prompts2: Union[list[str], None],
        num_images_per_prompt: int = 1,
        use_text_encoder_1: bool = True,  # sdxl
        use_text_encoder_2: bool = True,  # sdxl
) -> tuple[torch.FloatTensor, torch.FloatTensor]:
    # concatenation of text_encoder and text_encoder_2's penultimate layer outputs
    text_embeds_list = []
    pooled_text_embeds = None  # always text_encoder_2's pool

    if prompts2 is None:
        prompts2 = prompts

    for idx, (tokenizer, text_encoder) in enumerate(zip(tokenizers, text_encoders)):
        # todo: we are using a blank string to ignore that encoder for now.
        # find a better way to do this (zeroing?, removing it from the unet?)
        prompt_list_to_use = prompts if idx == 0 else prompts2
        if idx == 0 and not use_text_encoder_1:
            prompt_list_to_use = ["" for _ in prompts]
        if idx == 1 and not use_text_encoder_2:
            prompt_list_to_use = ["" for _ in prompts]

        text_tokens_input_ids = text_tokenize(tokenizer, prompt_list_to_use)
        text_embeds, pooled_text_embeds = text_encode_xl(
            text_encoder, text_tokens_input_ids, num_images_per_prompt
        )

        text_embeds_list.append(text_embeds)

    bs_embed = pooled_text_embeds.shape[0]
    pooled_text_embeds = pooled_text_embeds.repeat(1, num_images_per_prompt).view(
        bs_embed * num_images_per_prompt, -1
    )

    return torch.concat(text_embeds_list, dim=-1), pooled_text_embeds
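
# Note (added for clarity): the per-encoder hidden states are concatenated on
# the last dimension, so for a standard SDXL setup (hidden sizes 768 and 1280)
# the returned text embeddings have a feature dimension of 2048, while the
# pooled embedding comes from the second text encoder only (projection dim
# 1280, matching TEXT_ENCODER_2_PROJECTION_DIM above).
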
def text_encode(text_encoder: 'CLIPTextModel', tokens):
    return text_encoder(tokens.to(text_encoder.device))[0]


def encode_prompts(
        tokenizer: 'CLIPTokenizer',
        text_encoder: 'CLIPTextModel',
        prompts: list[str],
):
    text_tokens = text_tokenize(tokenizer, prompts)
    text_embeddings = text_encode(text_encoder, text_tokens)

    return text_embeddings
# for XL
def get_add_time_ids(
        height: int,
        width: int,
        dynamic_crops: bool = False,
        dtype: torch.dtype = torch.float32,
):
    if dynamic_crops:
        # random float scale between 1 and 3
        random_scale = torch.rand(1).item() * 2 + 1
        original_size = (int(height * random_scale), int(width * random_scale))
        # random position
        crops_coords_top_left = (
            torch.randint(0, original_size[0] - height, (1,)).item(),
            torch.randint(0, original_size[1] - width, (1,)).item(),
        )
        target_size = (height, width)
    else:
        original_size = (height, width)
        crops_coords_top_left = (0, 0)
        target_size = (height, width)

    # this is expected to have 6 values
    add_time_ids = list(original_size + crops_coords_top_left + target_size)

    # this is expected to be 2816
    passed_add_embed_dim = (
        UNET_ATTENTION_TIME_EMBED_DIM * len(add_time_ids)  # 256 * 6
        + TEXT_ENCODER_2_PROJECTION_DIM  # + 1280
    )
    if passed_add_embed_dim != UNET_PROJECTION_CLASS_EMBEDDING_INPUT_DIM:
        raise ValueError(
            f"Model expects an added time embedding vector of length {UNET_PROJECTION_CLASS_EMBEDDING_INPUT_DIM}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
        )

    add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
    return add_time_ids
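
# Illustrative usage (added for clarity): without dynamic crops the six values
# are (original_h, original_w, crop_top, crop_left, target_h, target_w), e.g.
#   get_add_time_ids(1024, 1024)
#   # -> tensor([[1024., 1024., 0., 0., 1024., 1024.]]), shape (1, 6)
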
def concat_embeddings(
        unconditional: torch.FloatTensor,
        conditional: torch.FloatTensor,
        n_imgs: int,
):
    return torch.cat([unconditional, conditional]).repeat_interleave(n_imgs, dim=0)
def add_all_snr_to_noise_scheduler(noise_scheduler, device):
    if hasattr(noise_scheduler, "all_snr"):
        return
    # compute it and cache it on the scheduler
    with torch.no_grad():
        alphas_cumprod = noise_scheduler.alphas_cumprod
        sqrt_alphas_cumprod = torch.sqrt(alphas_cumprod)
        sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - alphas_cumprod)
        alpha = sqrt_alphas_cumprod
        sigma = sqrt_one_minus_alphas_cumprod
        all_snr = (alpha / sigma) ** 2
        all_snr.requires_grad = False
    noise_scheduler.all_snr = all_snr.to(device)
def get_all_snr(noise_scheduler, device):
    if hasattr(noise_scheduler, "all_snr"):
        return noise_scheduler.all_snr.to(device)
    # compute it
    with torch.no_grad():
        alphas_cumprod = noise_scheduler.alphas_cumprod
        sqrt_alphas_cumprod = torch.sqrt(alphas_cumprod)
        sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - alphas_cumprod)
        alpha = sqrt_alphas_cumprod
        sigma = sqrt_one_minus_alphas_cumprod
        all_snr = (alpha / sigma) ** 2
        all_snr.requires_grad = False
    return all_snr.to(device)
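
# Note (added for clarity): with alpha = sqrt(alphas_cumprod) and
# sigma = sqrt(1 - alphas_cumprod), the value computed above is
# SNR(t) = alphas_cumprod[t] / (1 - alphas_cumprod[t]), the signal-to-noise
# ratio of the forward diffusion process at timestep t.
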
def apply_snr_weight(
        loss,
        timesteps,
        noise_scheduler: Union['DDPMScheduler'],
        gamma
):
    # gets the cached SNR from the noise scheduler if it exists, otherwise calculates it
    all_snr = get_all_snr(noise_scheduler, loss.device)
    snr = torch.stack([all_snr[t] for t in timesteps])
    gamma_over_snr = torch.div(torch.ones_like(snr) * gamma, snr)
    snr_weight = torch.minimum(gamma_over_snr, torch.ones_like(gamma_over_snr)).float().to(loss.device)  # from paper
    loss = loss * snr_weight
    return loss
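
# Note (added for clarity): this applies a min-SNR style weighting,
# loss_t * min(gamma / SNR(t), 1), so high-SNR (low-noise) timesteps are
# down-weighted while timesteps with SNR below gamma keep their full loss.
# The "from paper" comment above appears to refer to the Min-SNR weighting
# strategy (Hang et al., "Efficient Diffusion Training via Min-SNR Weighting
# Strategy", 2023).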