From bd2bce9b925bd2e3560b9fdbfb0e9d72e053cff9 Mon Sep 17 00:00:00 2001
From: Jaret Burkett
Date: Wed, 29 Nov 2023 14:32:48 -0700
Subject: [PATCH] Switched to trailing timestep spacing to make timesteps
 consistent across schedulers. Homed in on targeted guidance. It is finally
 perfect. (I think)

---
 extensions_built_in/sd_trainer/SDTrainer.py | 60 +++------------------
 toolkit/sampler.py                          | 12 ++---
 toolkit/stable_diffusion_model.py           |  8 +--
 3 files changed, 12 insertions(+), 68 deletions(-)

diff --git a/extensions_built_in/sd_trainer/SDTrainer.py b/extensions_built_in/sd_trainer/SDTrainer.py
index 18aba214..93bee42d 100644
--- a/extensions_built_in/sd_trainer/SDTrainer.py
+++ b/extensions_built_in/sd_trainer/SDTrainer.py
@@ -189,40 +189,15 @@ class SDTrainer(BaseSDTrainProcess):
         ):
             with torch.no_grad():
                 # Perform targeted guidance (working title)
-                conditional_noisy_latents = noisy_latents # target images
+                conditional_noisy_latents = noisy_latents.detach() # target images
                 dtype = get_torch_dtype(self.train_config.dtype)
                 if batch.unconditional_latents is not None:
                     # unconditional latents are the "neutral" images. Add noise here identical to
                     # the noise added to the conditional latents, at the same timesteps
-                    # unconditional_noisy_latents = self.sd.noise_scheduler.add_noise(
-                    #     batch.unconditional_latents, noise, timesteps
-                    # )
-                    unconditional_noisy_latents = self.sd.add_noise(batch.unconditional_latents, noise, timesteps)
-                    # calculate the differential between our conditional (target image) and out unconditional (neutral image)
-                    target_differential_noise = unconditional_noisy_latents - conditional_noisy_latents
-                    target_differential_noise = target_differential_noise.detach()
+                    unconditional_noisy_latents = self.sd.add_noise(batch.unconditional_latents, noise, timesteps).detach()
 
-                    # Calculate the mean along dim=1, keep dimensions
-                    mean_chan = torch.abs(torch.mean(target_differential_noise, dim=1, keepdim=True))
-
-                    # Create a mask with 0s where values are between 0.0 and 0.01, otherwise 1s
-                    mask = torch.where((mean_chan >= 0.0) & (mean_chan <= 0.01), 0.0, 1.0)
-
-                    # Duplicate the mask along dim 1 to match the shape of target_differential_noise
-                    mask = mask.expand_as(target_differential_noise)
-                    # this mask is now a 1 for our target differential and 0 for everything else
-
-                    # add the target differential to the target latents as if it were noise with the scheduler, scaled to
-                    # the current timestep. Scaling the noise here is important as it scales our guidance to the current
-                    # timestep. This is the key to making the guidance work.
-                    # guidance_latents = self.sd.noise_scheduler.add_noise(
-                    #     conditional_noisy_latents,
-                    #     target_differential_noise,
-                    #     timesteps
-                    # )
-                    guidance_latents = self.sd.add_noise(conditional_noisy_latents, target_differential_noise, timesteps)
 
                     # Disable the LoRA network so we can predict parent network knowledge without it
                     self.network.is_active = False
@@ -231,7 +206,7 @@ class SDTrainer(BaseSDTrainProcess):
                     # Predict noise to get a baseline of what the parent network wants to do with the latents + noise.
                     # This acts as our control to preserve the unaltered parts of the image.
                    baseline_prediction = self.sd.predict_noise(
-                        latents=guidance_latents.to(self.device_torch, dtype=dtype).detach(),
+                        latents=unconditional_noisy_latents.to(self.device_torch, dtype=dtype).detach(),
                         conditional_embeddings=conditional_embeds.to(self.device_torch, dtype=dtype).detach(),
                         timestep=timesteps,
                         guidance_scale=1.0,
@@ -245,42 +220,19 @@ class SDTrainer(BaseSDTrainProcess):
-                    # do our prediction with LoRA active on the scaled guidance latents
+                    # do our prediction with LoRA active on the conditional (target) noisy latents
                     prediction = self.sd.predict_noise(
-                        latents=guidance_latents.to(self.device_torch, dtype=dtype).detach(),
+                        latents=conditional_noisy_latents.to(self.device_torch, dtype=dtype).detach(),
                         conditional_embeddings=conditional_embeds.to(self.device_torch, dtype=dtype).detach(),
                         timestep=timesteps,
                         guidance_scale=1.0,
                         **pred_kwargs # adapter residuals in here
                     )
-                    # remove the baseline prediction from our prediction to get the differential between the two
-                    # all that should be left is the differential between the conditional and unconditional images
-                    pred_differential_noise = prediction - baseline_prediction
-
-                    # for loss, we target ONLY the unscaled differential between our conditional and unconditional latents
-                    # not the timestep scaled noise that was added. This is the diffusion training process.
-                    # This will guide the network to make identical predictions it previously did for everything EXCEPT our
-                    # differential between the conditional and unconditional images (target)
                     loss = torch.nn.functional.mse_loss(
-                        pred_differential_noise.float(),
-                        target_differential_noise.float(),
-                        reduction="none"
-                    )
-
-                    # multiply by our mask
-                    loss = loss * mask
-                    loss = loss.mean([1, 2, 3])
-                    # calculate inverse to match baseline prediction
-                    unmasked_prior_loss = torch.nn.functional.mse_loss(
-                        baseline_prediction.float(),
                         prediction.float(),
+                        baseline_prediction.float(),
                         reduction="none"
                     )
-                    # multiply by our mask
-                    unmasked_prior_loss = unmasked_prior_loss * (1.0 - mask)
-                    # add the unmasked prior loss to the masked loss
-                    unmasked_prior_loss = unmasked_prior_loss.mean([1, 2, 3])
-                    loss = loss + unmasked_prior_loss
-
+                    loss = loss.mean([1, 2, 3])
                     loss = self.apply_snr(loss, timesteps)
                     loss = loss.mean()
diff --git a/toolkit/sampler.py b/toolkit/sampler.py
index b2ba6646..6ccc621b 100644
--- a/toolkit/sampler.py
+++ b/toolkit/sampler.py
@@ -26,8 +26,8 @@ SCHEDULER_TIMESTEPS = 1000
 SCHEDLER_SCHEDULE = "scaled_linear"
 
 sdxl_sampler_config = {
-    "_class_name": "EulerDiscreteScheduler",
-    "_diffusers_version": "0.19.0.dev0",
+    "_class_name": "EulerAncestralDiscreteScheduler",
+    "_diffusers_version": "0.24.0.dev0",
     "beta_end": 0.012,
     "beta_schedule": "scaled_linear",
     "beta_start": 0.00085,
@@ -37,11 +37,10 @@ sdxl_sampler_config = {
     "prediction_type": "epsilon",
     "sample_max_value": 1.0,
     "set_alpha_to_one": False,
-    "skip_prk_steps": True,
+    "skip_prk_steps": False,
     "steps_offset": 1,
-    "timestep_spacing": "leading",
-    "trained_betas": None,
-    "use_karras_sigmas": False
+    "timestep_spacing": "trailing",
+    "trained_betas": None
 }
 
 
@@ -86,7 +85,6 @@ def get_sampler(
     scheduler = scheduler_cls.from_config(config)
 
-
     return scheduler
diff --git a/toolkit/stable_diffusion_model.py b/toolkit/stable_diffusion_model.py
index a608a02b..c2701bb3 100644
--- a/toolkit/stable_diffusion_model.py
+++ b/toolkit/stable_diffusion_model.py
@@ -674,14 +674,8 @@ class StableDiffusion:
 
         for idx in range(original_samples.shape[0]):
-            if scheduler_class_name not in index_noise_schedulers:
-                # convert to idx
-                noise_timesteps = [(self.noise_scheduler.timesteps == t).nonzero().item()
-                                   for t in timesteps_chunks[idx]]
-                noise_timesteps = torch.tensor(noise_timesteps, device=self.device_torch)
-            else:
-                noise_timesteps = timesteps_chunks[idx]
-            # the add noise for ddpm solver is broken, do it ourselves
+            noise_timesteps = timesteps_chunks[idx]
             if scheduler_class_name == 'DPMSolverMultistepScheduler':
                 # Make sure sigmas and timesteps have the same device and dtype as original_samples
                 sigmas = self.noise_scheduler.sigmas.to(device=original_samples_chunks[idx].device, dtype=original_samples_chunks[idx].dtype)
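
Note for reviewers: with the masking and guidance-latent machinery removed, the
SDTrainer change reduces to the sketch below. This is illustrative only;
targeted_guidance_loss and its argument list are hypothetical names, and the
real trainer pulls these values from self and batch rather than parameters.

    import torch
    import torch.nn.functional as F

    def targeted_guidance_loss(sd, network, conditional_latents, unconditional_latents,
                               noise, timesteps, conditional_embeds):
        with torch.no_grad():
            # Noise the target and neutral images with the same noise at the same
            # timesteps, so the only difference between them is the trained concept.
            conditional_noisy = sd.add_noise(conditional_latents, noise, timesteps).detach()
            unconditional_noisy = sd.add_noise(unconditional_latents, noise, timesteps).detach()

            # Baseline: the parent model's prediction for the neutral image, LoRA off.
            network.is_active = False
            baseline_prediction = sd.predict_noise(
                latents=unconditional_noisy,
                conditional_embeddings=conditional_embeds,
                timestep=timesteps,
                guidance_scale=1.0,
            )

        # Prediction: LoRA on, fed the target image.
        network.is_active = True
        prediction = sd.predict_noise(
            latents=conditional_noisy,
            conditional_embeddings=conditional_embeds,
            timestep=timesteps,
            guidance_scale=1.0,
        )

        # Pulling the LoRA's prediction on the target toward the parent model's
        # prediction on the neutral image preserves everything the two images
        # share; only their differential is learned.
        loss = F.mse_loss(prediction.float(), baseline_prediction.float(), reduction="none")
        return loss.mean([1, 2, 3])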
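The timestep_spacing switch in toolkit/sampler.py can be sanity-checked on its
own. A minimal sketch using diffusers (assuming steps_offset=1 and 1000 training
timesteps, as in the config above; exact values depend on the diffusers version):

    from diffusers import EulerAncestralDiscreteScheduler

    base = dict(
        num_train_timesteps=1000,
        beta_start=0.00085,
        beta_end=0.012,
        beta_schedule="scaled_linear",
        steps_offset=1,
    )

    for spacing in ("leading", "trailing"):
        sched = EulerAncestralDiscreteScheduler(**base, timestep_spacing=spacing)
        sched.set_timesteps(10)
        print(spacing, sched.timesteps.tolist())

With "leading" the schedule starts at 901 and never reaches the final training
timestep; with "trailing" it starts at 999, so inference timesteps line up with
the 0..999 range every scheduler trains on. That is the consistency the subject
line refers to.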
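Dropping the timestep-to-index conversion in add_noise leans on the fact that
diffusers schedulers take raw timestep values there. A simplified sketch of the
DDPM-style add_noise, condensed from diffusers and not this repo's code:

    import torch

    def add_noise_ddpm_style(scheduler, original_samples, noise, timesteps):
        # alphas_cumprod is indexed by the timestep values (0..999), not by
        # positions in scheduler.timesteps, so pre-converting values to indices,
        # as the deleted branch appears to have done, would select the wrong alphas.
        alphas_cumprod = scheduler.alphas_cumprod.to(original_samples.device)
        timesteps = timesteps.to(original_samples.device)
        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
        # Broadcast the per-sample scalars over the latent dimensions.
        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
        return sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise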