From bd2bce9b925bd2e3560b9fdbfb0e9d72e053cff9 Mon Sep 17 00:00:00 2001
From: Jaret Burkett
Date: Wed, 29 Nov 2023 14:32:48 -0700
Subject: [PATCH] Switched to trailing timestep spacing to make timesteps
 consistent across schedulers. Homed in on targeted guidance. It is finally
 perfect. (I think)

---
 extensions_built_in/sd_trainer/SDTrainer.py | 60 +++------------------
 toolkit/sampler.py                          | 12 ++---
 toolkit/stable_diffusion_model.py           |  8 +--
 3 files changed, 12 insertions(+), 68 deletions(-)

diff --git a/extensions_built_in/sd_trainer/SDTrainer.py b/extensions_built_in/sd_trainer/SDTrainer.py
index 18aba214..93bee42d 100644
--- a/extensions_built_in/sd_trainer/SDTrainer.py
+++ b/extensions_built_in/sd_trainer/SDTrainer.py
@@ -189,40 +189,15 @@ class SDTrainer(BaseSDTrainProcess):
         ):
             with torch.no_grad():
                 # Perform targeted guidance (working title)
-                conditional_noisy_latents = noisy_latents # target images
+                conditional_noisy_latents = noisy_latents.detach() # target images
                 dtype = get_torch_dtype(self.train_config.dtype)
                 if batch.unconditional_latents is not None:
                     # unconditional latents are the "neutral" images. Add noise here identical to
                     # the noise added to the conditional latents, at the same timesteps
-                    # unconditional_noisy_latents = self.sd.noise_scheduler.add_noise(
-                    #     batch.unconditional_latents, noise, timesteps
-                    # )
-                    unconditional_noisy_latents = self.sd.add_noise(batch.unconditional_latents, noise, timesteps)
-                    # calculate the differential between our conditional (target image) and out unconditional (neutral image)
-                    target_differential_noise = unconditional_noisy_latents - conditional_noisy_latents
-                    target_differential_noise = target_differential_noise.detach()
+                    unconditional_noisy_latents = self.sd.add_noise(batch.unconditional_latents, noise, timesteps).detach()
 
-                    # Calculate the mean along dim=1, keep dimensions
-                    mean_chan = torch.abs(torch.mean(target_differential_noise, dim=1, keepdim=True))
-
-                    # Create a mask with 0s where values are between 0.0 and 0.01, otherwise 1s
-                    mask = torch.where((mean_chan >= 0.0) & (mean_chan <= 0.01), 0.0, 1.0)
-
-                    # Duplicate the mask along dim 1 to match the shape of target_differential_noise
-                    mask = mask.expand_as(target_differential_noise)
-                    # this mask is now a 1 for our target differential and 0 for everything else
-
-                    # add the target differential to the target latents as if it were noise with the scheduler, scaled to
-                    # the current timestep. Scaling the noise here is important as it scales our guidance to the current
-                    # timestep. This is the key to making the guidance work.
-                    # guidance_latents = self.sd.noise_scheduler.add_noise(
-                    #     conditional_noisy_latents,
-                    #     target_differential_noise,
-                    #     timesteps
-                    # )
-                    guidance_latents = self.sd.add_noise(conditional_noisy_latents, target_differential_noise, timesteps)
 
                     # Disable the LoRA network so we can predict parent network knowledge without it
                     self.network.is_active = False
@@ -231,7 +206,7 @@ class SDTrainer(BaseSDTrainProcess):
                     # Predict noise to get a baseline of what the parent network wants to do with the latents + noise.
                     # This acts as our control to preserve the unaltered parts of the image.
                    baseline_prediction = self.sd.predict_noise(
-                        latents=guidance_latents.to(self.device_torch, dtype=dtype).detach(),
+                        latents=unconditional_noisy_latents.to(self.device_torch, dtype=dtype).detach(),
                         conditional_embeddings=conditional_embeds.to(self.device_torch, dtype=dtype).detach(),
                         timestep=timesteps,
                         guidance_scale=1.0,
@@ -245,42 +220,19 @@ class SDTrainer(BaseSDTrainProcess):
-                    # do our prediction with LoRA active on the scaled guidance latents
+                    # do our prediction with LoRA active on the conditional (target) noisy latents
                     prediction = self.sd.predict_noise(
-                        latents=guidance_latents.to(self.device_torch, dtype=dtype).detach(),
+                        latents=conditional_noisy_latents.to(self.device_torch, dtype=dtype).detach(),
                         conditional_embeddings=conditional_embeds.to(self.device_torch, dtype=dtype).detach(),
                         timestep=timesteps,
                         guidance_scale=1.0,
                         **pred_kwargs # adapter residuals in here
                     )
-                    # remove the baseline prediction from our prediction to get the differential between the two
-                    # all that should be left is the differential between the conditional and unconditional images
-                    pred_differential_noise = prediction - baseline_prediction
-
-                    # for loss, we target ONLY the unscaled differential between our conditional and unconditional latents
-                    # not the timestep scaled noise that was added. This is the diffusion training process.
-                    # This will guide the network to make identical predictions it previously did for everything EXCEPT our
-                    # differential between the conditional and unconditional images (target)
                     loss = torch.nn.functional.mse_loss(
-                        pred_differential_noise.float(),
-                        target_differential_noise.float(),
-                        reduction="none"
-                    )
-
-                    # multiply by our mask
-                    loss = loss * mask
-                    loss = loss.mean([1, 2, 3])
-                    # calculate inverse to match baseline prediction
-                    unmasked_prior_loss = torch.nn.functional.mse_loss(
-                        baseline_prediction.float(),
                         prediction.float(),
+                        baseline_prediction.float(),
                         reduction="none"
                     )
-                    # multiply by our mask
-                    unmasked_prior_loss = unmasked_prior_loss * (1.0 - mask)
-                    # add the unmasked prior loss to the masked loss
-                    unmasked_prior_loss = unmasked_prior_loss.mean([1, 2, 3])
-                    loss = loss + unmasked_prior_loss
-
+                    loss = loss.mean([1, 2, 3])
                     loss = self.apply_snr(loss, timesteps)
                     loss = loss.mean()
diff --git a/toolkit/sampler.py b/toolkit/sampler.py
index b2ba6646..6ccc621b 100644
--- a/toolkit/sampler.py
+++ b/toolkit/sampler.py
@@ -26,8 +26,8 @@ SCHEDULER_TIMESTEPS = 1000
 SCHEDLER_SCHEDULE = "scaled_linear"
 
 sdxl_sampler_config = {
-    "_class_name": "EulerDiscreteScheduler",
-    "_diffusers_version": "0.19.0.dev0",
+    "_class_name": "EulerAncestralDiscreteScheduler",
+    "_diffusers_version": "0.24.0.dev0",
     "beta_end": 0.012,
     "beta_schedule": "scaled_linear",
     "beta_start": 0.00085,
@@ -37,11 +37,10 @@ sdxl_sampler_config = {
     "prediction_type": "epsilon",
     "sample_max_value": 1.0,
     "set_alpha_to_one": False,
-    "skip_prk_steps": True,
+    "skip_prk_steps": False,
     "steps_offset": 1,
-    "timestep_spacing": "leading",
-    "trained_betas": None,
-    "use_karras_sigmas": False
+    "timestep_spacing": "trailing",
+    "trained_betas": None
 }
 
 
@@ -86,7 +85,6 @@ def get_sampler(
     scheduler = scheduler_cls.from_config(config)
 
-
     return scheduler
diff --git a/toolkit/stable_diffusion_model.py b/toolkit/stable_diffusion_model.py
index a608a02b..c2701bb3 100644
--- a/toolkit/stable_diffusion_model.py
+++ b/toolkit/stable_diffusion_model.py
@@ -674,14 +674,8 @@ class StableDiffusion:
 
         for idx in range(original_samples.shape[0]):
-            if scheduler_class_name not in index_noise_schedulers:
-                # convert to idx
-                noise_timesteps = [(self.noise_scheduler.timesteps == t).nonzero().item()
-                                   for t in timesteps_chunks[idx]]
-                noise_timesteps = torch.tensor(noise_timesteps, device=self.device_torch)
-            else:
-                noise_timesteps = timesteps_chunks[idx]
-            # the add noise for ddpm solver is broken, do it ourselves
+            noise_timesteps = timesteps_chunks[idx]
             if scheduler_class_name == 'DPMSolverMultistepScheduler':
                 # Make sure sigmas and timesteps have the same device and dtype as original_samples
                 sigmas = self.noise_scheduler.sigmas.to(device=original_samples_chunks[idx].device, dtype=original_samples_chunks[idx].dtype)
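
Note for reviewers: with the masking and guidance-latent machinery removed, the
SDTrainer change reduces to the sketch below. This is illustrative only;
targeted_guidance_loss and its argument list are hypothetical names, and the
real trainer pulls these values from self and batch rather than parameters.

    import torch
    import torch.nn.functional as F

    def targeted_guidance_loss(sd, network, conditional_latents, unconditional_latents,
                               noise, timesteps, conditional_embeds):
        with torch.no_grad():
            # Noise the target and neutral images with the same noise at the same
            # timesteps, so the only difference between them is the trained concept.
            conditional_noisy = sd.add_noise(conditional_latents, noise, timesteps).detach()
            unconditional_noisy = sd.add_noise(unconditional_latents, noise, timesteps).detach()

            # Baseline: the parent model's prediction for the neutral image, LoRA off.
            network.is_active = False
            baseline_prediction = sd.predict_noise(
                latents=unconditional_noisy,
                conditional_embeddings=conditional_embeds,
                timestep=timesteps,
                guidance_scale=1.0,
            )

        # Prediction: LoRA on, fed the target image.
        network.is_active = True
        prediction = sd.predict_noise(
            latents=conditional_noisy,
            conditional_embeddings=conditional_embeds,
            timestep=timesteps,
            guidance_scale=1.0,
        )

        # Pulling the LoRA's prediction on the target toward the parent model's
        # prediction on the neutral image preserves everything the two images
        # share; only their differential is learned.
        loss = F.mse_loss(prediction.float(), baseline_prediction.float(), reduction="none")
        return loss.mean([1, 2, 3])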
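The timestep_spacing switch in toolkit/sampler.py can be sanity-checked on its
own. A minimal sketch using diffusers (assuming steps_offset=1 and 1000 training
timesteps, as in the config above; exact values depend on the diffusers version):

    from diffusers import EulerAncestralDiscreteScheduler

    base = dict(
        num_train_timesteps=1000,
        beta_start=0.00085,
        beta_end=0.012,
        beta_schedule="scaled_linear",
        steps_offset=1,
    )

    for spacing in ("leading", "trailing"):
        sched = EulerAncestralDiscreteScheduler(**base, timestep_spacing=spacing)
        sched.set_timesteps(10)
        print(spacing, sched.timesteps.tolist())

With "leading" the schedule starts at 901 and never reaches the final training
timestep; with "trailing" it starts at 999, so inference timesteps line up with
the 0..999 range every scheduler trains on. That is the consistency the subject
line refers to.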
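Dropping the timestep-to-index conversion in add_noise leans on the fact that
diffusers schedulers take raw timestep values there. A simplified sketch of the
DDPM-style add_noise, condensed from diffusers and not this repo's code:

    import torch

    def add_noise_ddpm_style(scheduler, original_samples, noise, timesteps):
        # alphas_cumprod is indexed by the timestep values (0..999), not by
        # positions in scheduler.timesteps, so pre-converting values to indices,
        # as the deleted branch appears to have done, would select the wrong alphas.
        alphas_cumprod = scheduler.alphas_cumprod.to(original_samples.device)
        timesteps = timesteps.to(original_samples.device)
        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
        # Broadcast the per-sample scalars over the latent dimensions.
        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
        return sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise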