diff --git a/extensions_built_in/sd_trainer/SDTrainer.py b/extensions_built_in/sd_trainer/SDTrainer.py
index 7ba5862f..3a0bca57 100644
--- a/extensions_built_in/sd_trainer/SDTrainer.py
+++ b/extensions_built_in/sd_trainer/SDTrainer.py
@@ -989,7 +989,7 @@ class SDTrainer(BaseSDTrainProcess):
                 # training with assistance, we want it low
                 # adapter_strength_min = 0.4
                 # adapter_strength_max = 0.7
-                adapter_strength_min = 0.9
+                adapter_strength_min = 0.5
                 adapter_strength_max = 1.1
 
                 adapter_conditioning_scale = torch.rand(
diff --git a/toolkit/stable_diffusion_model.py b/toolkit/stable_diffusion_model.py
index 5e4ad77c..9b14b84a 100644
--- a/toolkit/stable_diffusion_model.py
+++ b/toolkit/stable_diffusion_model.py
@@ -8,7 +8,7 @@ from typing import Union, List, Literal, Iterator
 import sys
 import os
 from collections import OrderedDict
-
+import copy
 import yaml
 from PIL import Image
 from diffusers.pipelines.pixart_alpha.pipeline_pixart_sigma import ASPECT_RATIO_1024_BIN, ASPECT_RATIO_512_BIN, ASPECT_RATIO_2048_BIN, ASPECT_RATIO_256_BIN
@@ -423,6 +423,8 @@ class StableDiffusion:
         self.vae: 'AutoencoderKL' = pipe.vae.to(self.device_torch, dtype=dtype)
         self.vae.eval()
         self.vae.requires_grad_(False)
+        VAE_SCALE_FACTOR = 2 ** (len(self.vae.config['block_out_channels']) - 1)
+        self.vae_scale_factor = VAE_SCALE_FACTOR
         self.unet.to(self.device_torch, dtype=dtype)
         self.unet.requires_grad_(False)
         self.unet.eval()
@@ -439,6 +441,19 @@ class StableDiffusion:
             self.load_refiner()
         self.is_loaded = True
 
+        if self.is_pixart and self.vae_scale_factor == 16:
+            # TODO make our own pipeline?
+            # we generate an image 2x larger, so we need to copy the sizes from larger ones down
+            # ASPECT_RATIO_1024_BIN, ASPECT_RATIO_512_BIN, ASPECT_RATIO_2048_BIN, ASPECT_RATIO_256_BIN
+            for key in ASPECT_RATIO_256_BIN.keys():
+                ASPECT_RATIO_256_BIN[key] = [ASPECT_RATIO_256_BIN[key][0] * 2, ASPECT_RATIO_256_BIN[key][1] * 2]
+            for key in ASPECT_RATIO_512_BIN.keys():
+                ASPECT_RATIO_512_BIN[key] = [ASPECT_RATIO_512_BIN[key][0] * 2, ASPECT_RATIO_512_BIN[key][1] * 2]
+            for key in ASPECT_RATIO_1024_BIN.keys():
+                ASPECT_RATIO_1024_BIN[key] = [ASPECT_RATIO_1024_BIN[key][0] * 2, ASPECT_RATIO_1024_BIN[key][1] * 2]
+            for key in ASPECT_RATIO_2048_BIN.keys():
+                ASPECT_RATIO_2048_BIN[key] = [ASPECT_RATIO_2048_BIN[key][0] * 2, ASPECT_RATIO_2048_BIN[key][1] * 2]
+
     def te_train(self):
         if isinstance(self.text_encoder, list):
             for te in self.text_encoder:
@@ -1250,7 +1265,7 @@ class StableDiffusion:
             height, width = self.pipeline.image_processor.classify_height_width_bin(height, width, ratios=aspect_ratio_bin)
 
         added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
-        if self.unet.config.sample_size == 128:
+        if self.unet.config.sample_size == 128 or (self.vae_scale_factor == 16 and self.unet.config.sample_size == 64):
             resolution = torch.tensor([height, width]).repeat(batch_size, 1)
             aspect_ratio = torch.tensor([float(height / width)]).repeat(batch_size, 1)
             resolution = resolution.to(dtype=text_embeddings.text_embeds.dtype, device=self.device_torch)
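
Note on the PixArt-related changes above: diffusers derives a VAE's spatial downscale as 2 ** (len(block_out_channels) - 1), so a 5-block VAE gives a 16x factor instead of the usual 8x. Latents of a given sample_size then decode to images twice as large per side, which is why the aspect-ratio bins are doubled and why sample_size == 64 with a 16x VAE is treated like sample_size == 128 with an 8x one (64 * 16 == 128 * 8 == 1024 px). A minimal sketch of that arithmetic follows; the block_out_channels lists and bin entries are illustrative values, not read from a specific checkpoint config.

    # Sketch of the scale-factor and bin-doubling logic in the diff above.
    # block_out_channels lists and bin entries here are illustrative.

    def vae_scale_factor(block_out_channels):
        # each VAE down block after the first halves spatial resolution
        return 2 ** (len(block_out_channels) - 1)

    assert vae_scale_factor([128, 256, 512, 512]) == 8        # standard SD-style VAE
    assert vae_scale_factor([128, 256, 512, 512, 512]) == 16  # deeper 16x VAE

    # With a 16x VAE, sample_size 64 decodes to the same 1024 px output as
    # sample_size 128 with an 8x VAE -- hence the extended condition on
    # self.unet.config.sample_size in the last hunk.
    assert 64 * 16 == 128 * 8 == 1024

    # The PixArt bins map an aspect ratio to a (height, width) pair sized
    # for an 8x VAE; doubling both sides keeps the decoded pixel size right.
    aspect_ratio_bin = {"1.0": [1024.0, 1024.0], "0.5": [704.0, 1408.0]}
    for key in aspect_ratio_bin:
        aspect_ratio_bin[key] = [aspect_ratio_bin[key][0] * 2,
                                 aspect_ratio_bin[key][1] * 2]
    assert aspect_ratio_bin["1.0"] == [2048.0, 2048.0]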